From 80c469a0a03763f814715f3d12b6f3964c7423e8 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 14 Oct 2021 04:57:19 -0400 Subject: [PATCH 0001/1295] ARM: OMAP2+: hwmod: Add of_node_put() before break Fix following coccicheck warning: ./arch/arm/mach-omap2/omap_hwmod.c:753:1-23: WARNING: Function for_each_matching_node should have of_node_put() before break Early exits from for_each_matching_node should decrement the node reference counter. Signed-off-by: Wan Jiabing Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index ccb0e3732c0d..31d1a21f6041 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -752,8 +752,10 @@ static int __init _init_clkctrl_providers(void) for_each_matching_node(np, ti_clkctrl_match_table) { ret = _setup_clkctrl_provider(np); - if (ret) + if (ret) { + of_node_put(np); break; + } } return ret; From 34596ba380b03d181e24efd50e2f21045bde3696 Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Tue, 16 Nov 2021 06:27:26 +0000 Subject: [PATCH 0002/1295] ARM: OMAP2+: adjust the location of put_device() call in omapdss_init_of This was found by coccicheck: ./arch/arm/mach-omap2/display.c, 272, 1-7, ERROR missing put_device; call of_find_device_by_node on line 258, but without a corresponding object release within this function. Move the put_device() call before the if judgment. Reported-by: Zeal Robot Signed-off-by: Ye Guojin Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index 6daaa645ae5d..21413a9b7b6c 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -263,9 +263,9 @@ static int __init omapdss_init_of(void) } r = of_platform_populate(node, NULL, NULL, &pdev->dev); + put_device(&pdev->dev); if (r) { pr_err("Unable to populate DSS submodule devices\n"); - put_device(&pdev->dev); return r; } From 29a5e8496b3ac0d400dfe32288c26c774beb8cc8 Mon Sep 17 00:00:00 2001 From: Jayesh Choudhary Date: Thu, 25 Nov 2021 16:23:26 +0530 Subject: [PATCH 0003/1295] ARM: dts: am335x-wega: Fix typo in mcasp property rx-num-evt Fix the property name 'rx-num-evt'. Signed-off-by: Jayesh Choudhary Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am335x-wega.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am335x-wega.dtsi b/arch/arm/boot/dts/am335x-wega.dtsi index 673159d93a6a..f957fea8208e 100644 --- a/arch/arm/boot/dts/am335x-wega.dtsi +++ b/arch/arm/boot/dts/am335x-wega.dtsi @@ -55,7 +55,7 @@ 2 1 0 0 /* # 0: INACTIVE, 1: TX, 2: RX */ >; tx-num-evt = <16>; - rt-num-evt = <16>; + rx-num-evt = <16>; status = "okay"; }; From 23885389dbbbbc698986e77a45c1fc44a6e3632e Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 25 Nov 2021 16:48:34 +0200 Subject: [PATCH 0004/1295] ARM: dts: Fix timer regression for beagleboard revision c Commit e428e250fde6 ("ARM: dts: Configure system timers for omap3") caused a timer regression for beagleboard revision c where the system clockevent stops working if omap3isp module is unloaded. Turns out we still have beagleboard revisions a-b4 capacitor c70 quirks applied that limit the usable timers for no good reason. This also affects the power management as we use the system clock instead of the 32k clock source. Let's fix the issue by adding a new omap3-beagle-ab4.dts for the old timer quirks. This allows us to remove the timer quirks for later beagleboard revisions. We also need to update the related timer quirk check for the correct compatible property. Fixes: e428e250fde6 ("ARM: dts: Configure system timers for omap3") Cc: linux-kernel@vger.kernel.org Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Rob Herring Reported-by: Jarkko Nikula Tested-by: Jarkko Nikula Signed-off-by: Tony Lindgren --- .../devicetree/bindings/arm/omap/omap.txt | 3 ++ arch/arm/boot/dts/Makefile | 1 + arch/arm/boot/dts/omap3-beagle-ab4.dts | 47 +++++++++++++++++++ arch/arm/boot/dts/omap3-beagle.dts | 33 ------------- drivers/clocksource/timer-ti-dm-systimer.c | 2 +- 5 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 arch/arm/boot/dts/omap3-beagle-ab4.dts diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt index e77635c5422c..fa8b31660cad 100644 --- a/Documentation/devicetree/bindings/arm/omap/omap.txt +++ b/Documentation/devicetree/bindings/arm/omap/omap.txt @@ -119,6 +119,9 @@ Boards (incomplete list of examples): - OMAP3 BeagleBoard : Low cost community board compatible = "ti,omap3-beagle", "ti,omap3430", "ti,omap3" +- OMAP3 BeagleBoard A to B4 : Early BeagleBoard revisions A to B4 with a timer quirk + compatible = "ti,omap3-beagle-ab4", "ti,omap3-beagle", "ti,omap3430", "ti,omap3" + - OMAP3 Tobi with Overo : Commercial expansion board with daughter board compatible = "gumstix,omap3-overo-tobi", "gumstix,omap3-overo", "ti,omap3430", "ti,omap3" diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile index 0de64f237cd8..a387ebe8919b 100644 --- a/arch/arm/boot/dts/Makefile +++ b/arch/arm/boot/dts/Makefile @@ -794,6 +794,7 @@ dtb-$(CONFIG_ARCH_OMAP3) += \ logicpd-som-lv-37xx-devkit.dtb \ omap3430-sdp.dtb \ omap3-beagle.dtb \ + omap3-beagle-ab4.dtb \ omap3-beagle-xm.dtb \ omap3-beagle-xm-ab.dtb \ omap3-cm-t3517.dtb \ diff --git a/arch/arm/boot/dts/omap3-beagle-ab4.dts b/arch/arm/boot/dts/omap3-beagle-ab4.dts new file mode 100644 index 000000000000..990ff2d84686 --- /dev/null +++ b/arch/arm/boot/dts/omap3-beagle-ab4.dts @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-only +/dts-v1/; + +#include "omap3-beagle.dts" + +/ { + model = "TI OMAP3 BeagleBoard A to B4"; + compatible = "ti,omap3-beagle-ab4", "ti,omap3-beagle", "ti,omap3430", "ti,omap3"; +}; + +/* + * Workaround for capacitor C70 issue, see "Boards revision A and < B5" + * section at https://elinux.org/BeagleBoard_Community + */ + +/* Unusable as clocksource because of unreliable oscillator */ +&counter32k { + status = "disabled"; +}; + +/* Unusable as clockevent because of unreliable oscillator, allow to idle */ +&timer1_target { + /delete-property/ti,no-reset-on-init; + /delete-property/ti,no-idle; + timer@0 { + /delete-property/ti,timer-alwon; + }; +}; + +/* Preferred always-on timer for clocksource */ +&timer12_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + /* Always clocked by secure_32k_fck */ + }; +}; + +/* Preferred timer for clockevent */ +&timer2_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + assigned-clocks = <&gpt2_fck>; + assigned-clock-parents = <&sys_ck>; + }; +}; diff --git a/arch/arm/boot/dts/omap3-beagle.dts b/arch/arm/boot/dts/omap3-beagle.dts index f9f34b8458e9..0548b391334f 100644 --- a/arch/arm/boot/dts/omap3-beagle.dts +++ b/arch/arm/boot/dts/omap3-beagle.dts @@ -304,39 +304,6 @@ phys = <0 &hsusb2_phy>; }; -/* Unusable as clocksource because of unreliable oscillator */ -&counter32k { - status = "disabled"; -}; - -/* Unusable as clockevent because if unreliable oscillator, allow to idle */ -&timer1_target { - /delete-property/ti,no-reset-on-init; - /delete-property/ti,no-idle; - timer@0 { - /delete-property/ti,timer-alwon; - }; -}; - -/* Preferred always-on timer for clocksource */ -&timer12_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - /* Always clocked by secure_32k_fck */ - }; -}; - -/* Preferred timer for clockevent */ -&timer2_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - assigned-clocks = <&gpt2_fck>; - assigned-clock-parents = <&sys_ck>; - }; -}; - &twl_gpio { ti,use-leds; /* pullups: BIT(1) */ diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index b6f97960d8ee..5c40ca1d4740 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -241,7 +241,7 @@ static void __init dmtimer_systimer_assign_alwon(void) bool quirk_unreliable_oscillator = false; /* Quirk unreliable 32 KiHz oscillator with incomplete dts */ - if (of_machine_is_compatible("ti,omap3-beagle") || + if (of_machine_is_compatible("ti,omap3-beagle-ab4") || of_machine_is_compatible("timll,omap3-devkit8000")) { quirk_unreliable_oscillator = true; counter_32k = -ENODEV; From 9206a3af4fc0cebbefca2d79876d279bdd8d582b Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 17 Dec 2021 13:55:58 +0200 Subject: [PATCH 0005/1295] clk: ti: Move dra7 clock devices out of the legacy section I accidentally added some dra7 clock defines to the legacy section that we want to stop using. Let's move the defines to the right location. Note that this is just a cosmetic fix. Cc: linux-clk@vger.kernel.org Cc: Stephen Boyd Cc: Tero Kristo Acked-by: Rob Herring Signed-off-by: Tony Lindgren --- include/dt-bindings/clock/dra7.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/dt-bindings/clock/dra7.h b/include/dt-bindings/clock/dra7.h index 7d57063b8a65..29ff6b895848 100644 --- a/include/dt-bindings/clock/dra7.h +++ b/include/dt-bindings/clock/dra7.h @@ -84,17 +84,10 @@ #define DRA7_L3_MAIN_2_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_L3_INSTR_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) -/* iva clocks */ -#define DRA7_IVA_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) -#define DRA7_SL2IF_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) - /* dss clocks */ #define DRA7_DSS_CORE_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_BB2D_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) -/* gpu clocks */ -#define DRA7_GPU_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) - /* l3init clocks */ #define DRA7_MMC1_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) #define DRA7_MMC2_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) @@ -267,10 +260,17 @@ #define DRA7_L3INSTR_L3_MAIN_2_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_L3INSTR_L3_INSTR_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) +/* iva clocks */ +#define DRA7_IVA_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) +#define DRA7_SL2IF_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) + /* dss clocks */ #define DRA7_DSS_DSS_CORE_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_DSS_BB2D_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) +/* gpu clocks */ +#define DRA7_GPU_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) + /* l3init clocks */ #define DRA7_L3INIT_MMC1_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) #define DRA7_L3INIT_MMC2_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) From 31aa7056bbec0259e2ec91db7d3571f66b14f93f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 17 Dec 2021 13:55:59 +0200 Subject: [PATCH 0006/1295] ARM: dts: Don't use legacy clock defines for dra7 clkctrl Looks like we are still using legacy clock defines for dra7. We want to stop using these as it prevents dropping the legacy clocks. Note that this is just a cosmetic fix. Cc: Tero Kristo Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/dra7.dtsi | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 6b485cbed8d5..42bff117656c 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -160,7 +160,7 @@ target-module@48210000 { compatible = "ti,sysc-omap4-simple", "ti,sysc"; power-domains = <&prm_mpu>; - clocks = <&mpu_clkctrl DRA7_MPU_CLKCTRL 0>; + clocks = <&mpu_clkctrl DRA7_MPU_MPU_CLKCTRL 0>; clock-names = "fck"; #address-cells = <1>; #size-cells = <1>; @@ -875,10 +875,10 @@ <0x58000014 4>; reg-names = "rev", "syss"; ti,syss-mask = <1>; - clocks = <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 0>, - <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 9>, - <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 10>, - <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 11>; + clocks = <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 0>, + <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 9>, + <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 10>, + <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 11>; clock-names = "fck", "hdmi_clk", "sys_clk", "tv_clk"; #address-cells = <1>; #size-cells = <1>; @@ -912,7 +912,7 @@ SYSC_OMAP2_SOFTRESET | SYSC_OMAP2_AUTOIDLE)>; ti,syss-mask = <1>; - clocks = <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 8>; + clocks = <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 8>; clock-names = "fck"; #address-cells = <1>; #size-cells = <1>; @@ -939,8 +939,8 @@ , ; ti,sysc-mask = <(SYSC_OMAP4_SOFTRESET)>; - clocks = <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 9>, - <&dss_clkctrl DRA7_DSS_CORE_CLKCTRL 8>; + clocks = <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 9>, + <&dss_clkctrl DRA7_DSS_DSS_CORE_CLKCTRL 8>; clock-names = "fck", "dss_clk"; #address-cells = <1>; #size-cells = <1>; @@ -979,7 +979,7 @@ compatible = "vivante,gc"; reg = <0x0 0x700>; interrupts = ; - clocks = <&dss_clkctrl DRA7_BB2D_CLKCTRL 0>; + clocks = <&dss_clkctrl DRA7_DSS_BB2D_CLKCTRL 0>; clock-names = "core"; }; }; @@ -1333,7 +1333,7 @@ ti,no-reset-on-init; ti,no-idle; timer@0 { - assigned-clocks = <&wkupaon_clkctrl DRA7_TIMER1_CLKCTRL 24>; + assigned-clocks = <&wkupaon_clkctrl DRA7_WKUPAON_TIMER1_CLKCTRL 24>; assigned-clock-parents = <&sys_32k_ck>; }; }; From aa7069d840dab6bef4887657b94e439c82ae985a Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 16 Dec 2021 18:14:49 +0800 Subject: [PATCH 0007/1295] scsi: qedf: Fix potential dereference of NULL pointer The return value of dma_alloc_coherent() needs to be checked to avoid use of NULL pointer in case of an allocation failure. Link: https://lore.kernel.org/r/20211216101449.375953-1-jiasheng@iscas.ac.cn Fixes: 61d8658b4a43 ("scsi: qedf: Add QLogic FastLinQ offload FCoE driver framework.") Acked-by: Saurav Kashyap Signed-off-by: Jiasheng Jiang Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 1bf7a22d4948..cdc66e2a9488 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -1415,6 +1415,8 @@ static void qedf_upload_connection(struct qedf_ctx *qedf, */ term_params = dma_alloc_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE, &term_params_dma, GFP_KERNEL); + if (!term_params) + return; QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Uploading connection " "port_id=%06x.\n", fcport->rdata->ids.port_id); From 4d516e495235a8e95b482a1e9441c4043c7a8235 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Dec 2021 22:11:19 -0800 Subject: [PATCH 0008/1295] scsi: aacraid: Fix spelling of "its" Use the possessive "its" instead of the contraction "it's" in user messages. Link: https://lore.kernel.org/r/20211223061119.18304-1-rdunlap@infradead.org Cc: Adaptec OEM Raid Solutions Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Signed-off-by: Randy Dunlap Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aachba.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index 59f6b7b2a70a..b04d039da276 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -271,7 +271,7 @@ MODULE_PARM_DESC(msi, "IRQ handling." " 0=PIC(default), 1=MSI, 2=MSI-X)"); module_param(startup_timeout, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(startup_timeout, "The duration of time in seconds to wait for" - " adapter to have it's kernel up and\n" + " adapter to have its kernel up and\n" "running. This is typically adjusted for large systems that do not" " have a BIOS."); module_param(aif_timeout, int, S_IRUGO|S_IWUSR); From 81d3f500ee98cf9f5388f0fb0548ba5a57598827 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 29 Sep 2021 18:17:44 +0900 Subject: [PATCH 0009/1295] scsi: core: Fix scsi_mode_select() interface The modepage argument is unused. Remove it. Link: https://lore.kernel.org/r/20210929091744.706003-3-damien.lemoal@wdc.com Reviewed-by: Himanshu Madhani Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 8 +++----- drivers/scsi/sd.c | 2 +- include/scsi/scsi_device.h | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 621d841d819a..6c2a35d31ecc 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2026,7 +2026,6 @@ void scsi_exit_queue(void) * @sdev: SCSI device to be queried * @pf: Page format bit (1 == standard, 0 == vendor specific) * @sp: Save page bit (0 == don't save, 1 == save) - * @modepage: mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout @@ -2039,10 +2038,9 @@ void scsi_exit_queue(void) * status on error * */ -int -scsi_mode_select(struct scsi_device *sdev, int pf, int sp, int modepage, - unsigned char *buffer, int len, int timeout, int retries, - struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) +int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, + unsigned char *buffer, int len, int timeout, int retries, + struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[10]; unsigned char *real_buffer; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2a50a840a00c..2e1250b899da 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -209,7 +209,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, */ data.device_specific = 0; - if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT, + if (scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr)) { if (scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d1c6fc83b1e3..7571e23d8d8a 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -415,9 +415,8 @@ extern int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, - int modepage, unsigned char *buffer, int len, - int timeout, int retries, - struct scsi_mode_data *data, + unsigned char *buffer, int len, int timeout, + int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr); From 9211faa39a0350fb2239a0bce03b9459cd14fc40 Mon Sep 17 00:00:00 2001 From: Suganath Prabu S Date: Mon, 27 Dec 2021 11:00:55 +0530 Subject: [PATCH 0010/1295] scsi: mpt3sas: Update persistent trigger pages from sysfs interface Store sysfs-provided trigger values into the corresponding persistent trigger pages. Otherwise trigger entries are not persistent across system reboots. Link: https://lore.kernel.org/r/20211227053055.289537-1-suganath-prabu.subramani@broadcom.com Signed-off-by: Suganath Prabu S Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.h | 4 +- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 87 +++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index a0af986633d2..949e98d523e2 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -77,8 +77,8 @@ #define MPT3SAS_DRIVER_NAME "mpt3sas" #define MPT3SAS_AUTHOR "Avago Technologies " #define MPT3SAS_DESCRIPTION "LSI MPT Fusion SAS 3.0 Device Driver" -#define MPT3SAS_DRIVER_VERSION "39.100.00.00" -#define MPT3SAS_MAJOR_VERSION 39 +#define MPT3SAS_DRIVER_VERSION "40.100.00.00" +#define MPT3SAS_MAJOR_VERSION 40 #define MPT3SAS_MINOR_VERSION 100 #define MPT3SAS_BUILD_VERSION 0 #define MPT3SAS_RELEASE_VERSION 00 diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index 05b6c6a073c3..d92ca140d298 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -3533,11 +3533,31 @@ diag_trigger_master_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_MASTER_TRIGGER_T *master_tg; unsigned long flags; ssize_t rc; + bool set = 1; + + rc = min(sizeof(struct SL_WH_MASTER_TRIGGER_T), count); + + if (ioc->supports_trigger_pages) { + master_tg = kzalloc(sizeof(struct SL_WH_MASTER_TRIGGER_T), + GFP_KERNEL); + if (!master_tg) + return -ENOMEM; + + memcpy(master_tg, buf, rc); + if (!master_tg->MasterData) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg1(ioc, master_tg, + set)) { + kfree(master_tg); + return -EFAULT; + } + kfree(master_tg); + } spin_lock_irqsave(&ioc->diag_trigger_lock, flags); - rc = min(sizeof(struct SL_WH_MASTER_TRIGGER_T), count); memset(&ioc->diag_trigger_master, 0, sizeof(struct SL_WH_MASTER_TRIGGER_T)); memcpy(&ioc->diag_trigger_master, buf, rc); @@ -3589,11 +3609,31 @@ diag_trigger_event_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_EVENT_TRIGGERS_T *event_tg; unsigned long flags; ssize_t sz; + bool set = 1; + + sz = min(sizeof(struct SL_WH_EVENT_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + event_tg = kzalloc(sizeof(struct SL_WH_EVENT_TRIGGERS_T), + GFP_KERNEL); + if (!event_tg) + return -ENOMEM; + + memcpy(event_tg, buf, sz); + if (!event_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg2(ioc, event_tg, + set)) { + kfree(event_tg); + return -EFAULT; + } + kfree(event_tg); + } spin_lock_irqsave(&ioc->diag_trigger_lock, flags); - sz = min(sizeof(struct SL_WH_EVENT_TRIGGERS_T), count); + memset(&ioc->diag_trigger_event, 0, sizeof(struct SL_WH_EVENT_TRIGGERS_T)); memcpy(&ioc->diag_trigger_event, buf, sz); @@ -3644,11 +3684,31 @@ diag_trigger_scsi_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_SCSI_TRIGGERS_T *scsi_tg; unsigned long flags; ssize_t sz; + bool set = 1; + + sz = min(sizeof(struct SL_WH_SCSI_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + scsi_tg = kzalloc(sizeof(struct SL_WH_SCSI_TRIGGERS_T), + GFP_KERNEL); + if (!scsi_tg) + return -ENOMEM; + + memcpy(scsi_tg, buf, sz); + if (!scsi_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg3(ioc, scsi_tg, + set)) { + kfree(scsi_tg); + return -EFAULT; + } + kfree(scsi_tg); + } spin_lock_irqsave(&ioc->diag_trigger_lock, flags); - sz = min(sizeof(ioc->diag_trigger_scsi), count); + memset(&ioc->diag_trigger_scsi, 0, sizeof(ioc->diag_trigger_scsi)); memcpy(&ioc->diag_trigger_scsi, buf, sz); if (ioc->diag_trigger_scsi.ValidEntries > NUM_VALID_ENTRIES) @@ -3698,11 +3758,30 @@ diag_trigger_mpi_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_MPI_TRIGGERS_T *mpi_tg; unsigned long flags; ssize_t sz; + bool set = 1; + + sz = min(sizeof(struct SL_WH_MPI_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + mpi_tg = kzalloc(sizeof(struct SL_WH_MPI_TRIGGERS_T), + GFP_KERNEL); + if (!mpi_tg) + return -ENOMEM; + + memcpy(mpi_tg, buf, sz); + if (!mpi_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg4(ioc, mpi_tg, + set)) { + kfree(mpi_tg); + return -EFAULT; + } + kfree(mpi_tg); + } spin_lock_irqsave(&ioc->diag_trigger_lock, flags); - sz = min(sizeof(struct SL_WH_MPI_TRIGGERS_T), count); memset(&ioc->diag_trigger_mpi, 0, sizeof(ioc->diag_trigger_mpi)); memcpy(&ioc->diag_trigger_mpi, buf, sz); From 5867b8569e6480d3926b0508ee896532c59fde32 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 24 Dec 2021 17:52:40 +0000 Subject: [PATCH 0011/1295] scsi: mpi3mr: Fix some spelling mistakes There are some spelling mistakes in some literal strings. Fix them. Link: https://lore.kernel.org/r/20211224175240.1348942-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index c39dd4978c9d..eb07334dd43d 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -901,7 +901,7 @@ static const struct { }, { MPI3MR_RESET_FROM_SYSFS, "sysfs invocation" }, { MPI3MR_RESET_FROM_SYSFS_TIMEOUT, "sysfs TM timeout" }, - { MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronus reset" }, + { MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronous reset" }, }; /** @@ -1242,7 +1242,7 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) ioc_state = mpi3mr_get_iocstate(mrioc); if (ioc_state == MRIOC_STATE_READY) { ioc_info(mrioc, - "successfully transistioned to %s state\n", + "successfully transitioned to %s state\n", mpi3mr_iocstate_name(ioc_state)); return 0; } @@ -3844,7 +3844,7 @@ retry_init: if (mrioc->shost->nr_hw_queues > mrioc->num_op_reply_q) { ioc_err(mrioc, - "cannot create minimum number of operatioanl queues expected:%d created:%d\n", + "cannot create minimum number of operational queues expected:%d created:%d\n", mrioc->shost->nr_hw_queues, mrioc->num_op_reply_q); goto out_failed_noretry; } From 3bb3c24e268ab64305eec670be253eef2238b013 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 31 Dec 2021 16:23:50 +0800 Subject: [PATCH 0012/1295] scsi: mpi3mr: Fix formatting problems in some kernel-doc comments Remove some warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. drivers/scsi/mpi3mr/mpi3mr_fw.c:2188: warning: Function parameter or member 'reason_code' not described in 'mpi3mr_check_rh_fault_ioc' drivers/scsi/mpi3mr/mpi3mr_fw.c:3650: warning: Excess function parameter 'init_type' description in 'mpi3mr_init_ioc' drivers/scsi/mpi3mr/mpi3mr_fw.c:4177: warning: bad line Link: https://lore.kernel.org/r/20211231082350.19315-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index eb07334dd43d..15bdc21ead66 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -2174,7 +2174,7 @@ out: * mpi3mr_check_rh_fault_ioc - check reset history and fault * controller * @mrioc: Adapter instance reference - * @reason_code, reason code for the fault. + * @reason_code: reason code for the fault. * * This routine will save snapdump and fault the controller with * the given reason code if it is not already in the fault or @@ -3633,7 +3633,6 @@ static int mpi3mr_enable_events(struct mpi3mr_ioc *mrioc) /** * mpi3mr_init_ioc - Initialize the controller * @mrioc: Adapter instance reference - * @init_type: Flag to indicate is the init_type * * This the controller initialization routine, executed either * after soft reset or from pci probe callback. @@ -4174,7 +4173,7 @@ static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc) /** * mpi3mr_cleanup_ioc - Cleanup controller * @mrioc: Adapter instance reference - + * * controller cleanup handler, Message unit reset or soft reset * and shutdown notification is issued to the controller. * From ee05cb71f9f7eb0c257f6b80b70edb4659151b26 Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Tue, 28 Dec 2021 16:47:53 +0530 Subject: [PATCH 0013/1295] scsi: pm80xx: Port reset timeout error handling correction Error handling steps were not in sequence as per the programmers manual. Expected sequence: - PHY_DOWN (PORT_IN_RESET) - PORT_RESET_TIMER_TMO - Host aborts pending I/Os - Host deregister the device - Host sends HW_EVENT_PHY_DOWN ACK Previously we were sending HW_EVENT_PHY_DOWN ACK first and then deregister the device. Fix this to use the expected sequence. Link: https://lore.kernel.org/r/20211228111753.10802-1-Ajish.Koshy@microchip.com Signed-off-by: Ajish Koshy Signed-off-by: Viswas G Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_sas.c | 7 ++++++- drivers/scsi/pm8001/pm8001_sas.h | 3 +++ drivers/scsi/pm8001/pm80xx_hwi.c | 7 +++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index c9a16eef38c1..160ee8b228c9 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -1199,7 +1199,7 @@ int pm8001_abort_task(struct sas_task *task) struct pm8001_device *pm8001_dev; struct pm8001_tmf_task tmf_task; int rc = TMF_RESP_FUNC_FAILED, ret; - u32 phy_id; + u32 phy_id, port_id; struct sas_task_slow slow_task; if (unlikely(!task || !task->lldd_task || !task->dev)) @@ -1246,6 +1246,7 @@ int pm8001_abort_task(struct sas_task *task) DECLARE_COMPLETION_ONSTACK(completion_reset); DECLARE_COMPLETION_ONSTACK(completion); struct pm8001_phy *phy = pm8001_ha->phy + phy_id; + port_id = phy->port->port_id; /* 1. Set Device state as Recovery */ pm8001_dev->setds_completion = &completion; @@ -1297,6 +1298,10 @@ int pm8001_abort_task(struct sas_task *task) PORT_RESET_TMO); if (phy->port_reset_status == PORT_RESET_TMO) { pm8001_dev_gone_notify(dev); + PM8001_CHIP_DISP->hw_event_ack_req( + pm8001_ha, 0, + 0x07, /*HW_EVENT_PHY_DOWN ack*/ + port_id, phy_id, 0, 0); goto out; } } diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 83eec16d021d..a17da1cebce1 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -216,6 +216,9 @@ struct pm8001_dispatch { u32 state); int (*sas_re_init_req)(struct pm8001_hba_info *pm8001_ha); int (*fatal_errors)(struct pm8001_hba_info *pm8001_ha); + void (*hw_event_ack_req)(struct pm8001_hba_info *pm8001_ha, + u32 Qnum, u32 SEA, u32 port_id, u32 phyId, u32 param0, + u32 param1); }; struct pm8001_chip_info { diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 0849ecc913c7..97750d0ebee9 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3709,8 +3709,10 @@ static int mpi_hw_event(struct pm8001_hba_info *pm8001_ha, void *piomb) break; case HW_EVENT_PORT_RESET_TIMER_TMO: pm8001_dbg(pm8001_ha, MSG, "HW_EVENT_PORT_RESET_TIMER_TMO\n"); - pm80xx_hw_event_ack_req(pm8001_ha, 0, HW_EVENT_PHY_DOWN, - port_id, phy_id, 0, 0); + if (!pm8001_ha->phy[phy_id].reset_completion) { + pm80xx_hw_event_ack_req(pm8001_ha, 0, HW_EVENT_PHY_DOWN, + port_id, phy_id, 0, 0); + } sas_phy_disconnected(sas_phy); phy->phy_attached = 0; sas_notify_port_event(sas_phy, PORTE_LINK_RESET_ERR, @@ -5051,4 +5053,5 @@ const struct pm8001_dispatch pm8001_80xx_dispatch = { .fw_flash_update_req = pm8001_chip_fw_flash_update_req, .set_dev_state_req = pm8001_chip_set_dev_state_req, .fatal_errors = pm80xx_fatal_errors, + .hw_event_ack_req = pm80xx_hw_event_ack_req, }; From c3b48443ba7c4467d44f7faa16fea204ea6c239c Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Tue, 4 Jan 2022 11:24:52 +0000 Subject: [PATCH 0014/1295] scsi: aic79xx: Remove redundant error variable Return the value from ahd_linux_queue_abort_cmd() directly instead of using a redundant variable. Link: https://lore.kernel.org/r/20220104112452.601899-1-chi.minghao@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: CGEL ZTE Signed-off-by: Martin K. Petersen --- drivers/scsi/aic7xxx/aic79xx_osm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 5d566d2b2997..928099163f0f 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -755,11 +755,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, static int ahd_linux_abort(struct scsi_cmnd *cmd) { - int error; - - error = ahd_linux_queue_abort_cmd(cmd); - - return error; + return ahd_linux_queue_abort_cmd(cmd); } /* From b5e7b59c3480f355910f9d2c6ece5857922a5e54 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: [PATCH 0015/1295] NFS: change nfs_access_get_cached to only report the mask Currently the nfs_access_get_cached family of functions report a 'struct nfs_access_entry' as the result, with both .mask and .cred set. However the .cred is never used. This is probably good and there is no guarantee that it won't be freed before use. Change to only report the 'mask' - as this is all that is used or needed. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 +++++++++----------- fs/nfs/nfs4proc.c | 18 +++++++++--------- include/linux/nfs_fs.h | 4 ++-- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 731d31015b6a..8487a6d69116 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2673,7 +2673,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } -static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block) +static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; @@ -2703,8 +2703,7 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: @@ -2716,7 +2715,7 @@ out_zap: return -ENOENT; } -static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. @@ -2738,22 +2737,21 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } -int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct -nfs_access_entry *res, bool may_block) +int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block) { int status; - status = nfs_access_get_cached_rcu(inode, cred, res); + status = nfs_access_get_cached_rcu(inode, cred, mask); if (status != 0) - status = nfs_access_get_cached_locked(inode, cred, res, + status = nfs_access_get_cached_locked(inode, cred, mask, may_block); return status; @@ -2874,7 +2872,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) trace_nfs_access_enter(inode); - status = nfs_access_get_cached(inode, cred, &cache, may_block); + status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee3bc79f6ca3..322ff45ad15c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7611,7 +7611,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - struct nfs_access_entry cache; + u32 mask; int ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) @@ -7626,8 +7626,8 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, * do a cached access check for the XA* flags to possibly avoid * doing an RPC and getting EACCES back. */ - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAWRITE)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAWRITE)) return -EACCES; } @@ -7648,14 +7648,14 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - struct nfs_access_entry cache; + u32 mask; ssize_t ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAREAD)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAREAD)) return -EACCES; } @@ -7680,13 +7680,13 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) ssize_t ret, size; char *buf; size_t buflen; - struct nfs_access_entry cache; + u32 mask; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return 0; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XALIST)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XALIST)) return 0; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 05f249f20f55..f33559acbcc2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -533,8 +533,8 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); -extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, - bool may_block); +extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block); /* * linux/fs/nfs/symlink.c From 73fbb3fa647bdb5b60469af8101c741ece03a825 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: [PATCH 0016/1295] NFS: pass cred explicitly for access tests Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So a future patch will remove the ->cred ref from nfs_access_entry. To prepare, change various functions to not assume there is a 'cred' in the nfs_access_entry, but to pass the cred around explicitly. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 17 ++++++++++------- fs/nfs/nfs3proc.c | 5 +++-- fs/nfs/nfs4proc.c | 12 +++++++----- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 2 +- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8487a6d69116..a34351ce79a2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2758,7 +2758,9 @@ int nfs_access_get_cached(struct inode *inode, const struct cred *cred, } EXPORT_SYMBOL_GPL(nfs_access_get_cached); -static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +static void nfs_access_add_rbtree(struct inode *inode, + struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; @@ -2771,7 +2773,7 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry * while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); - cmp = cred_fscmp(set->cred, entry->cred); + cmp = cred_fscmp(cred, entry->cred); if (cmp < 0) p = &parent->rb_left; @@ -2793,13 +2795,14 @@ found: nfs_access_free_entry(entry); } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->cred = get_cred(set->cred); + cache->cred = get_cred(cred); cache->mask = set->mask; /* The above field assignments must be visible @@ -2807,7 +2810,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) * use rcu_assign_pointer, so just force the memory barrier. */ smp_wmb(); - nfs_access_add_rbtree(inode, cache); + nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); @@ -2893,7 +2896,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) else cache.mask |= NFS_ACCESS_EXECUTE; cache.cred = cred; - status = NFS_PROTO(inode)->access(inode, &cache); + status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { if (!S_ISDIR(inode->i_mode)) @@ -2903,7 +2906,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) } goto out; } - nfs_access_add_cache(inode, &cache); + nfs_access_add_cache(inode, &cache, cred); out_cached: cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7100514d306b..1597eef40d54 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -220,7 +220,8 @@ static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, task_flags); } -static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), @@ -231,7 +232,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = -ENOMEM; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 322ff45ad15c..f02aa9877e6f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2655,7 +2655,7 @@ static int nfs4_opendata_access(const struct cred *cred, cache.cred = cred; nfs_access_set_mask(&cache, opendata->o_res.access_result); - nfs_access_add_cache(state->inode, &cache); + nfs_access_add_cache(state->inode, &cache, cred); flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; if ((mask & ~cache.mask & flags) == 0) @@ -4441,7 +4441,8 @@ static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, return err; } -static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { @@ -4455,7 +4456,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = 0; @@ -4475,14 +4476,15 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry return status; } -static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_access(inode, entry); + err = _nfs4_proc_access(inode, entry, cred); trace_nfs4_access(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f33559acbcc2..c33b1b792bc9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -396,7 +396,7 @@ extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fa extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); -extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); +extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct user_namespace *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 967a0098f0a9..9102bdaa3faa 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1737,7 +1737,7 @@ struct nfs_rpc_ops { struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); - int (*access) (struct inode *, struct nfs_access_entry *); + int (*access) (struct inode *, struct nfs_access_entry *, const struct cred *); int (*readlink)(struct inode *, struct page *, unsigned int, unsigned int); int (*create) (struct inode *, struct dentry *, From 6238aec83f3fb12132f964937e5bbcf248fea8f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: [PATCH 0017/1295] NFS: don't store 'struct cred *' in struct nfs_access_entry Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So remove the 'struct cred *' and replace it with the fields we need: kuid_t, kgid_t, and struct group_info * This makes the 'struct nfs_access_entry' 64 bits larger. New function "access_cmp" is introduced which is identical to cred_fscmp() except that the second arg is an 'nfs_access_entry', rather than a 'cred' Fixes: b68572e07c58 ("NFS: change access cache to use 'struct cred'.") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 50 +++++++++++++++++++++++++++++++++++++----- fs/nfs/nfs4proc.c | 1 - include/linux/nfs_fs.h | 4 +++- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a34351ce79a2..7dee3fd10382 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2528,7 +2528,7 @@ MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache lengt static void nfs_access_free_entry(struct nfs_access_entry *entry) { - put_cred(entry->cred); + put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); @@ -2654,6 +2654,43 @@ void nfs_access_zap_cache(struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); +static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) +{ + struct group_info *ga, *gb; + int g; + + if (uid_lt(a->fsuid, b->fsuid)) + return -1; + if (uid_gt(a->fsuid, b->fsuid)) + return 1; + + if (gid_lt(a->fsgid, b->fsgid)) + return -1; + if (gid_gt(a->fsgid, b->fsgid)) + return 1; + + ga = a->group_info; + gb = b->group_info; + if (ga == gb) + return 0; + if (ga == NULL) + return -1; + if (gb == NULL) + return 1; + if (ga->ngroups < gb->ngroups) + return -1; + if (ga->ngroups > gb->ngroups) + return 1; + + for (g = 0; g < ga->ngroups; g++) { + if (gid_lt(ga->gid[g], gb->gid[g])) + return -1; + if (gid_gt(ga->gid[g], gb->gid[g])) + return 1; + } + return 0; +} + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; @@ -2661,7 +2698,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co while (n != NULL) { struct nfs_access_entry *entry = rb_entry(n, struct nfs_access_entry, rb_node); - int cmp = cred_fscmp(cred, entry->cred); + int cmp = access_cmp(cred, entry); if (cmp < 0) n = n->rb_left; @@ -2731,7 +2768,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || - cred_fscmp(cred, cache->cred) != 0) + access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; @@ -2773,7 +2810,7 @@ static void nfs_access_add_rbtree(struct inode *inode, while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); - cmp = cred_fscmp(cred, entry->cred); + cmp = access_cmp(cred, entry); if (cmp < 0) p = &parent->rb_left; @@ -2802,7 +2839,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->cred = get_cred(cred); + cache->fsuid = cred->fsuid; + cache->fsgid = cred->fsgid; + cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; /* The above field assignments must be visible @@ -2895,7 +2934,6 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else cache.mask |= NFS_ACCESS_EXECUTE; - cache.cred = cred; status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f02aa9877e6f..5d9ff01ddf46 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2653,7 +2653,6 @@ static int nfs4_opendata_access(const struct cred *cred, } else if ((fmode & FMODE_READ) && !opendata->file_created) mask = NFS4_ACCESS_READ; - cache.cred = cred; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache, cred); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c33b1b792bc9..450e393d4bf2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -61,7 +61,9 @@ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; - const struct cred * cred; + kuid_t fsuid; + kgid_t fsgid; + struct group_info *group_info; __u32 mask; struct rcu_head rcu_head; }; From 204975036b34f55237bc44c8a302a88468ef21b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 15 Dec 2021 16:38:15 -0500 Subject: [PATCH 0018/1295] NFS: Ensure the server has an up to date ctime before hardlinking Creating a hard link is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: 9f7682728728 ("NFS: Move the delegation return down into nfs4_proc_link()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7dee3fd10382..3f8e5122d004 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2379,6 +2379,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); From 6ff9d99bb88faebf134ca668842349d9718e5464 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 15 Dec 2021 16:38:16 -0500 Subject: [PATCH 0019/1295] NFS: Ensure the server has an up to date ctime before renaming Renaming a file is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: f2c2c552f119 ("NFS: Move delegation recall into the NFSv4 callback for rename_setup()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3f8e5122d004..9883f72fdb6f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2470,6 +2470,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); From 4b0c359b813bbf115f5e2219ea8c0e4fad92400b Mon Sep 17 00:00:00 2001 From: Pierguido Lambri Date: Mon, 13 Dec 2021 08:38:48 +0000 Subject: [PATCH 0020/1295] SUNRPC: Add source address/port to rpc_socket* traces The rpc_socket* traces now show also the source address and port. An example is: kworker/u17:1-951 [005] 134218.925343: rpc_socket_close: socket:[46913] srcaddr=192.168.100.187:793 dstaddr=192.168.100.129:2049 state=4 (DISCONNECTING) sk_state=7 (CLOSE) kworker/u17:0-242 [006] 134360.841370: rpc_socket_connect: error=-115 socket:[56322] srcaddr=192.168.100.187:769 dstaddr=192.168.100.129:2049 state=2 (CONNECTING) sk_state=2 (SYN_SENT) -0 [006] 134360.841859: rpc_socket_state_change: socket:[56322] srcaddr=192.168.100.187:769 dstaddr=192.168.100.129:2049 state=2 (CONNECTING) sk_state=1 (ESTABLISHED) Signed-off-by: Pierguido Lambri Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 52 +++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 3a99358c262b..a6af347c935d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -794,6 +794,9 @@ RPC_SHOW_SOCKET RPC_SHOW_SOCK + +#include + /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. @@ -816,27 +819,32 @@ DECLARE_EVENT_CLASS(xs_socket_event, __field(unsigned int, socket_state) __field(unsigned int, sock_state) __field(unsigned long long, ino) - __string(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]) - __string(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( struct inode *inode = SOCK_INODE(socket); + const struct sock *sk = socket->sk; + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + __entry->socket_state = socket->state; __entry->sock_state = socket->sk->sk_state; __entry->ino = (unsigned long long)inode->i_ino; - __assign_str(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]); + ), TP_printk( - "socket:[%llu] dstaddr=%s/%s " + "socket:[%llu] srcaddr=%pISpc dstaddr=%pISpc " "state=%u (%s) sk_state=%u (%s)", - __entry->ino, __get_str(dstaddr), __get_str(dstport), + __entry->ino, + __entry->saddr, + __entry->daddr, __entry->socket_state, rpc_show_socket_state(__entry->socket_state), __entry->sock_state, @@ -866,29 +874,33 @@ DECLARE_EVENT_CLASS(xs_socket_event_done, __field(unsigned int, socket_state) __field(unsigned int, sock_state) __field(unsigned long long, ino) - __string(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]) - __string(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( struct inode *inode = SOCK_INODE(socket); + const struct sock *sk = socket->sk; + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + __entry->socket_state = socket->state; __entry->sock_state = socket->sk->sk_state; __entry->ino = (unsigned long long)inode->i_ino; __entry->error = error; - __assign_str(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]); ), TP_printk( - "error=%d socket:[%llu] dstaddr=%s/%s " + "error=%d socket:[%llu] srcaddr=%pISpc dstaddr=%pISpc " "state=%u (%s) sk_state=%u (%s)", __entry->error, - __entry->ino, __get_str(dstaddr), __get_str(dstport), + __entry->ino, + __entry->saddr, + __entry->daddr, __entry->socket_state, rpc_show_socket_state(__entry->socket_state), __entry->sock_state, From c72a826829ccfb38019187a3a5ba6d3584b7b7dc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 10 Aug 2021 18:31:01 -0500 Subject: [PATCH 0021/1295] nfs41: pnfs: filelayout: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code a bit according to the use of a flexible-array member in struct nfs4_file_layout_dsaddr instead of a one-element array, and use the struct_size() helper. This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). This issue was found with the help of Coccinelle and audited and fixed, manually. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.h | 2 +- fs/nfs/filelayout/filelayoutdev.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 79323b5dab0c..aed0748fd6ec 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[1]; + struct nfs4_pnfs_ds *ds_list[]; }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 86c3f7e69ec4..acf4b88889dc 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -136,9 +136,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_stripe_indices; } - dsaddr = kzalloc(sizeof(*dsaddr) + - (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), - gfp_flags); + dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags); if (!dsaddr) goto out_err_free_stripe_indices; From 35e0f9a9af4869a4b1a3943da08d3a29ac6c4a42 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Wed, 10 Nov 2021 09:32:17 +0000 Subject: [PATCH 0022/1295] sunrpc: Remove unneeded null check In g_verify_token_header, the null check of 'ret' is unneeded to be done twice. Signed-off-by: Xu Wang Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/gss_generic_token.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index fe97f3106536..4a4082bb22ad 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -222,10 +222,8 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, if (ret) return ret; - if (!ret) { - *buf_in = buf; - *body_size = toksize; - } + *buf_in = buf; + *body_size = toksize; return ret; } From c4f0396688b5916b3a95bcb004d158634f2234ff Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 18 Nov 2021 17:37:41 +0800 Subject: [PATCH 0023/1295] SUNRPC: clean up some inconsistent indenting Eliminate the follow smatch warning: net/sunrpc/xprtsock.c:1912 xs_local_connect() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d8ee06a9650a..69b6ee5a5fd1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1910,7 +1910,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; - if (RPC_IS_ASYNC(task)) { + if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the * filesystem namespace of the process making the rpc From 2c52c8376db7160a1dd8a681c61c9258405ef143 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 29 Nov 2021 15:33:56 -0500 Subject: [PATCH 0024/1295] NFSv4 only print the label when its queried When the bitmask of the attributes doesn't include the security label, don't bother printing it. Since the label might not be null terminated, adjust the printing format accordingly. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 69862bf6db00..801119b7a596 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4200,10 +4200,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; } From fbd2057e5329d3502a27491190237b6be52a1cb6 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 17 Dec 2021 01:01:33 +0800 Subject: [PATCH 0025/1295] nfs: nfs4clinet: check the return value of kstrdup() kstrdup() returns NULL when some internal memory errors happen, it is better to check the return value of it so to catch the memory error in time. Signed-off-by: Xiaoke Wang Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d8b5a250ca05..47a6cf892c95 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1343,8 +1343,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root))); From b05bf5c63b326ce1da84ef42498d8e0e292e694c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 3 Jan 2022 14:50:16 -0500 Subject: [PATCH 0026/1295] NFSv4.1: Fix uninitialised variable in devicenotify When decode_devicenotify_args() exits with no entries, we need to ensure that the struct cb_devicenotifyargs is initialised to { 0, NULL } in order to avoid problems in nfs4_callback_devicenotify(). Reported-by: Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 2 +- fs/nfs/callback_xdr.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 6a2033131c06..ccd4f245cae2 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -170,7 +170,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 09c5b1cb3e07..c343666d9a42 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,7 +358,7 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + uint32_t i; __be32 res = 0; struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a67c41ec545f..f90de8043b0f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = xdr_inline_decode(xdr, sizeof(uint32_t)); @@ -271,7 +269,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; if (n > ULONG_MAX / sizeof(*args->devs)) { status = htonl(NFS4ERR_BADXDR); @@ -330,19 +328,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, From 1ab5be4ac5b1c9ce39ce1037c45b68d2ce6eede0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:54 -0500 Subject: [PATCH 0027/1295] NFSv4: Add some support for case insensitive filesystems Add capabilities to allow the NFS client to recognise when it is dealing with case insensitive and case preserving filesystems. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 8 +++++++- fs/nfs/nfs4xdr.c | 40 +++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs_sb.h | 2 ++ include/linux/nfs_xdr.h | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d9ff01ddf46..a6d7472058ad 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3840,7 +3840,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_FH_EXPIRE_TYPE | FATTR4_WORD0_LINK_SUPPORT | FATTR4_WORD0_SYMLINK_SUPPORT | - FATTR4_WORD0_ACLSUPPORT; + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING; if (minorversion) bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; @@ -3869,6 +3871,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.case_insensitive) + server->caps |= NFS_CAP_CASE_INSENSITIVE; + if (res.case_preserving) + server->caps |= NFS_CAP_CASE_PRESERVING; #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 801119b7a596..a9487c840052 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3533,6 +3533,42 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint return 0; } +static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; + } + dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +} + +static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; + } + dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +} + static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; @@ -4413,6 +4449,10 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0) + goto xdr_error; + if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0) + goto xdr_error; if ((status = decode_attr_exclcreat_supported(xdr, bitmap, res->exclcreat_bitmask)) != 0) goto xdr_error; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2a9acbfe00f0..f24fc67af42d 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -271,6 +271,8 @@ struct nfs_server { #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) #define NFS_CAP_LGOPEN (1U << 5) +#define NFS_CAP_CASE_INSENSITIVE (1U << 6) +#define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9102bdaa3faa..ddff92396a3f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1194,6 +1194,8 @@ struct nfs4_server_caps_res { u32 has_links; u32 has_symlinks; u32 fh_expire_type; + u32 case_insensitive; + u32 case_preserving; }; #define NFS4_PATHNAME_MAXCOMPONENTS 512 From 98ca3ee60b9e4b039cc3ef21a169e775afa9bd0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:55 -0500 Subject: [PATCH 0028/1295] NFSv4: Just don't cache negative dentries on case insensitive servers If the directory contents change, we cannot rely on the negative dentry being cacheable. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9883f72fdb6f..7ebfe4e3cf85 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1436,6 +1436,9 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; + /* Case insensitive server? Revalidate negative dentries */ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } From 8ce37abdeb4c73842d16620e1da151765ac86b5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:56 -0500 Subject: [PATCH 0029/1295] NFS: Invalidate negative dentries on all case insensitive directory changes If we create a file, rename it, or hardlink it, then we need to assume that cached negative dentries need to be revalidated. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7ebfe4e3cf85..a691a1e364a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1324,6 +1324,14 @@ void nfs_clear_verifier_delegated(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ +static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && + d_really_is_negative(dentry)) + return dentry->d_time == inode_peek_iversion_raw(dir); + return nfs_verify_change_attribute(dir, dentry->d_time); +} + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -1337,7 +1345,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { @@ -1346,7 +1354,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } @@ -1539,7 +1547,7 @@ out: * If the lookup failed despite the dentry change attribute being * a match, then we should revalidate the directory cache. */ - if (!ret && nfs_verify_change_attribute(dir, dentry->d_time)) + if (!ret && nfs_dentry_verify_change(dir, dentry)) nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } @@ -1778,8 +1786,11 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (error == -ENOENT) + if (error == -ENOENT) { + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; + } if (error < 0) { res = ERR_PTR(error); goto out; From 00bdadc7accfce944dc30fbc205cd28a7eed657b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:57 -0500 Subject: [PATCH 0030/1295] NFS: Add a helper to remove case-insensitive aliases When dealing with case insensitive names, the client has no idea how the server performs the mapping, so cannot collapse the dentries into a single representative. So both rename and unlink need to deal with the fact that there could be several dentries representing the file, and have to somehow force them to be revalidated. Use d_prune_aliases() as a big hammer approach. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 12 +++++++++++- fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 5 ++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a691a1e364a7..d2a58f7a73a6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1819,6 +1819,14 @@ out: } EXPORT_SYMBOL_GPL(nfs_lookup); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode) +{ + /* Case insensitive server? Revalidate dentries */ + if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) + d_prune_aliases(inode); +} +EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); + #if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct dentry *, unsigned int); @@ -2199,8 +2207,10 @@ static void nfs_dentry_remove_handle_error(struct inode *dir, switch (error) { case -ENOENT: d_delete(dentry); - fallthrough; + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + break; case 0: + nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 12f6acb483bb..2de7c56a1fbe 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -373,6 +373,7 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink, extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct user_namespace *, struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a6d7472058ad..7df2eb74025f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4670,8 +4670,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, nfs_fattr_init(res->dir_attr); - if (inode) + if (inode) { nfs4_inode_return_delegation(inode); + nfs_d_prune_case_insensitive_aliases(inode); + } } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4737,6 +4739,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; if (task->tk_status == 0) { + nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry)); if (new_dir != old_dir) { /* Note: If we moved a directory, nlink will change */ nfs4_update_changeattr(old_dir, &res->old_cinfo, From 68eaba4ca924a97a863c5c81c0b23a11dcb6db90 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:58 -0500 Subject: [PATCH 0031/1295] NFS: Fix the verifier for case sensitive filesystem in nfs_atomic_open() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d2a58f7a73a6..2884138848f4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1888,6 +1888,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + unsigned long dir_verifier; bool switched = false; int created = 0; int err; @@ -1961,7 +1962,11 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); + else + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: From 01f34245722b9dac0b2667db4dc17d2049b99c76 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 15:41:38 +0100 Subject: [PATCH 0032/1295] NFS: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the NFS code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Anna Schumaker --- fs/nfs/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 8cb70755e3c9..a6f740366963 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -142,10 +142,11 @@ static struct attribute *nfs_netns_client_attrs[] = { &nfs_netns_client_id.attr, NULL, }; +ATTRIBUTE_GROUPS(nfs_netns_client); static struct kobj_type nfs_netns_client_type = { .release = nfs_netns_client_release, - .default_attrs = nfs_netns_client_attrs, + .default_groups = nfs_netns_client_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = nfs_netns_client_namespace, }; From 86439fa2678d1ae752ac9c787aac1c145b87b4c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 15:48:23 +0100 Subject: [PATCH 0033/1295] SUNRPC: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the sunrpc sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: "J. Bruce Fields" Cc: Chuck Lever Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 2766dd21935b..b1aea3419218 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -422,6 +422,7 @@ static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_change_state.attr, NULL, }; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); @@ -430,6 +431,7 @@ static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, NULL, }; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, @@ -439,14 +441,14 @@ static struct kobj_type rpc_sysfs_client_type = { static struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, - .default_attrs = rpc_sysfs_xprt_switch_attrs, + .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; static struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, - .default_attrs = rpc_sysfs_xprt_attrs, + .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; From 85847280b11666ae24aac519e06f0742fab72064 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Dec 2021 14:40:51 -0500 Subject: [PATCH 0034/1295] NFSv4: Allow writebacks to request 'blocks used' When doing a non-pNFS write, allow the writeback code to specify that it also needs to update 'blocks used'. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 21 +++++++-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ed5eaca6801e..89076dd32334 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -315,6 +315,8 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity); extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct inode *inode); extern int update_open_stateid(struct nfs4_state *state, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7df2eb74025f..44ab27f20f04 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -108,10 +108,6 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *, bool); #endif -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], - const __u32 *src, struct inode *inode, - struct nfs_server *server, - struct nfs4_label *label); #ifdef CONFIG_NFS_V4_SECURITY_LABEL static inline struct nfs4_label * @@ -3669,7 +3665,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!nfs4_have_delegation(inode, FMODE_READ)) { nfs4_bitmask_set(calldata->arg.bitmask_store, server->cache_consistency_bitmask, - inode, server, NULL); + inode, 0); calldata->arg.bitmask = calldata->arg.bitmask_store; } else calldata->arg.bitmask = NULL; @@ -5432,14 +5428,14 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, - struct inode *inode, struct nfs_server *server, - struct nfs4_label *label) +void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity) { - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + struct nfs_server *server = NFS_SERVER(inode); unsigned int i; memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ); + cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity); if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; @@ -5451,8 +5447,6 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP; if (cache_validity & NFS_INO_INVALID_NLINK) bitmask[1] |= FATTR4_WORD1_NUMLINKS; - if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) - bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; if (cache_validity & NFS_INO_INVALID_CTIME) bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) @@ -5479,7 +5473,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, } else { nfs4_bitmask_set(hdr->args.bitmask_store, server->cache_consistency_bitmask, - hdr->inode, server, NULL); + hdr->inode, NFS_INO_INVALID_BLOCKS); hdr->args.bitmask = hdr->args.bitmask_store; } @@ -6517,8 +6511,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; nfs4_bitmask_set(data->args.bitmask_store, - server->cache_consistency_bitmask, inode, server, - NULL); + server->cache_consistency_bitmask, inode, 0); data->args.bitmask = data->args.bitmask_store; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); From 34bf20ce986c441c1088ed09a33e0bb96e52f99a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Dec 2021 14:40:52 -0500 Subject: [PATCH 0035/1295] NFSv42: Fallocate and clone should also request 'blocks used' Both fallocate and clone can end up updating the blocks used attribute. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8b21ff1be717..32129446beca 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -46,7 +46,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); - u32 bitmask[3]; + u32 bitmask[NFS_BITMASK_SZ]; struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, @@ -69,9 +69,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, return status; } - memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask)); - if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED) - bitmask[1] |= FATTR4_WORD1_SPACE_USED; + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode, + NFS_INO_INVALID_BLOCKS); res.falloc_fattr = nfs_alloc_fattr(); if (!res.falloc_fattr) @@ -1044,13 +1043,14 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); struct nfs_server *server = NFS_SERVER(dst_inode); + __u32 dst_bitmask[NFS_BITMASK_SZ]; struct nfs42_clone_args args = { .src_fh = NFS_FH(src_inode), .dst_fh = NFS_FH(dst_inode), .src_offset = src_offset, .dst_offset = dst_offset, .count = count, - .dst_bitmask = server->cache_consistency_bitmask, + .dst_bitmask = dst_bitmask, }; struct nfs42_clone_res res = { .server = server, @@ -1079,6 +1079,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, if (!res.dst_fattr) return -ENOMEM; + nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask, + dst_inode, NFS_INO_INVALID_BLOCKS); + status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); From 5d9224fb076e9a2023e0b06d6a164d644612c0c0 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 4 Jan 2022 20:42:06 +0800 Subject: [PATCH 0036/1295] scsi: hisi_sas: Remove unused variable and check in hisi_sas_send_ata_reset_each_phy() In commit 29e2bac87421 ("scsi: hisi_sas: Fix some issues related to asd_sas_port->phy_list"), we use asd_sas_port->phy_mask instead of accessing asd_sas_port->phy_list, and it is enough to use asd_sas_port->phy_mask to check the state of phy, so remove the unused check and variable. Link: https://lore.kernel.org/r/1641300126-53574-1-git-send-email-chenxiang66@hisilicon.com Fixes: 29e2bac87421 ("scsi: hisi_sas: Fix some issues related to asd_sas_port->phy_list") Reported-by: Nathan Chancellor Reported-by: Colin King Acked-by: John Garry Signed-off-by: Xiang Chen Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index f46f679fe825..a05ec7aece5a 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1525,16 +1525,11 @@ static void hisi_sas_send_ata_reset_each_phy(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; int s = sizeof(struct host_to_dev_fis); int rc = TMF_RESP_FUNC_FAILED; - struct asd_sas_phy *sas_phy; struct ata_link *link; u8 fis[20] = {0}; - u32 state; int i; - state = hisi_hba->hw->get_phys_state(hisi_hba); for (i = 0; i < hisi_hba->n_phy; i++) { - if (!(state & BIT(sas_phy->id))) - continue; if (!(sas_port->phy_mask & BIT(i))) continue; From 315d049ad1951cef02d9337a2469cac51cca6932 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Jan 2022 09:36:33 -0800 Subject: [PATCH 0037/1295] scsi: megaraid: Avoid mismatched storage type sizes Remove needless use of mbox_t, replacing with just struct mbox_out. Silences compiler warnings under a -Warray-bounds build: drivers/scsi/megaraid.c: In function 'megaraid_probe_one': drivers/scsi/megaraid.c:3615:30: error: array subscript 'mbox_t[0]' is partly outside array bounds of 'unsigned char[15]' [-Werror=array-bounds] 3615 | mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/scsi/megaraid.c:3599:23: note: while referencing 'raw_mbox' 3599 | unsigned char raw_mbox[sizeof(struct mbox_out)]; | ^~~~~~~~ Link: https://lore.kernel.org/r/20220105173633.2421129-1-keescook@chromium.org Cc: Kashyap Desai Cc: Sumit Saxena Cc: Shivasharan S Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: megaraidlinux.pdl@broadcom.com Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid.c | 84 +++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 0d31d7a5e335..bf987f3a7f3f 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -192,23 +192,21 @@ mega_query_adapter(adapter_t *adapter) { dma_addr_t prod_info_dma_handle; mega_inquiry3 *inquiry3; - u8 raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + u8 *raw_mbox = (u8 *)&mbox; int retval; /* Initialize adapter inquiry mailbox */ - mbox = (mbox_t *)raw_mbox; - memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * Try to issue Inquiry3 command * if not succeeded, then issue MEGA_MBOXCMD_ADAPTERINQ command and * update enquiry3 structure */ - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; inquiry3 = (mega_inquiry3 *)adapter->mega_buffer; @@ -232,10 +230,10 @@ mega_query_adapter(adapter_t *adapter) inq = &ext_inq->raid_inq; - mbox->m_out.xferaddr = (u32)dma_handle; + mbox.xferaddr = (u32)dma_handle; /*issue old 0x04 command to adapter */ - mbox->m_out.cmd = MEGA_MBOXCMD_ADPEXTINQ; + mbox.cmd = MEGA_MBOXCMD_ADPEXTINQ; issue_scb_block(adapter, raw_mbox); @@ -262,7 +260,7 @@ mega_query_adapter(adapter_t *adapter) sizeof(mega_product_info), DMA_FROM_DEVICE); - mbox->m_out.xferaddr = prod_info_dma_handle; + mbox.xferaddr = prod_info_dma_handle; raw_mbox[0] = FC_NEW_CONFIG; /* i.e. mbox->cmd=0xA1 */ raw_mbox[2] = NC_SUBOP_PRODUCT_INFO; /* i.e. 0x0E */ @@ -3569,16 +3567,14 @@ mega_n_to_m(void __user *arg, megacmd_t *mc) static int mega_is_bios_enabled(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; raw_mbox[0] = IS_BIOS_ENABLED; raw_mbox[2] = GET_BIOS; @@ -3600,13 +3596,11 @@ mega_is_bios_enabled(adapter_t *adapter) static void mega_enum_raid_scsi(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int i; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command to find out what channels are raid/scsi @@ -3616,7 +3610,7 @@ mega_enum_raid_scsi(adapter_t *adapter) memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; /* * Non-ROMB firmware fail this command, so all channels @@ -3655,23 +3649,21 @@ static void mega_get_boot_drv(adapter_t *adapter) { struct private_bios_data *prv_bios_data; - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; u16 cksum = 0; u8 *cksum_p; u8 boot_pdrv; int i; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); raw_mbox[0] = BIOS_PVT_DATA; raw_mbox[2] = GET_BIOS_PVT_DATA; memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; adapter->boot_ldrv_enabled = 0; adapter->boot_ldrv = 0; @@ -3721,13 +3713,11 @@ mega_get_boot_drv(adapter_t *adapter) static int mega_support_random_del(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int rval; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command @@ -3750,13 +3740,11 @@ mega_support_random_del(adapter_t *adapter) static int mega_support_ext_cdb(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int rval; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command to find out if controller supports extended CDBs. */ @@ -3865,16 +3853,14 @@ mega_do_del_logdrv(adapter_t *adapter, int logdrv) static void mega_get_max_sgl(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - mbox = (mbox_t *)raw_mbox; - - memset(mbox, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; raw_mbox[0] = MAIN_MISC_OPCODE; raw_mbox[2] = GET_MAX_SG_SUPPORT; @@ -3888,7 +3874,7 @@ mega_get_max_sgl(adapter_t *adapter) } else { adapter->sglen = *((char *)adapter->mega_buffer); - + /* * Make sure this is not more than the resources we are * planning to allocate @@ -3910,16 +3896,14 @@ mega_get_max_sgl(adapter_t *adapter) static int mega_support_cluster(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - mbox = (mbox_t *)raw_mbox; - - memset(mbox, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; /* * Try to get the initiator id. This command will succeed iff the From ac795161c93699d600db16c1a8cc23a65a1eceaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:02 -0500 Subject: [PATCH 0038/1295] NFSv4: Handle case where the lookup of a directory fails If the application sets the O_DIRECTORY flag, and tries to open a regular file, nfs_atomic_open() will punt to doing a regular lookup. If the server then returns a regular file, we will happily return a file descriptor with uninitialised open state. The fix is to return the expected ENOTDIR error in these cases. Reported-by: Lyu Tao Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2884138848f4..408c3bb549b1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1994,6 +1994,19 @@ out: no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) + res = ERR_PTR(-ENOTDIR); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } + } if (switched) { d_lookup_done(dentry); if (!res) From 1751fc1db36f6f411709e143d5393f92d12137a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:03 -0500 Subject: [PATCH 0039/1295] NFSv4: nfs_atomic_open() can race when looking up a non-regular file If the file type changes back to being a regular file on the server between the failed OPEN and our LOOKUP, then we need to re-run the OPEN. Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 408c3bb549b1..5df75ed09268 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1999,12 +1999,17 @@ no_open: if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) { dput(res); res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); } } if (switched) { From 6dc701ee9fabfc929cae2d7acc957bf38e4c3264 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sat, 11 Dec 2021 13:36:32 +0100 Subject: [PATCH 0040/1295] MAINTAINERS: Add Apple watchdog to ARM/APPLE MACHINE SUPPORT Add apple_wdt.c under the ARM/APPLE MACHINE SUPPORT entry. Signed-off-by: Sven Peter Signed-off-by: Hector Martin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 03ffcf49f5cf..95cd7274cb61 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1760,6 +1760,7 @@ F: drivers/irqchip/irq-apple-aic.c F: drivers/mailbox/apple-mailbox.c F: drivers/pinctrl/pinctrl-apple-gpio.c F: drivers/soc/apple/* +F: drivers/watchdog/apple_wdt.c F: include/dt-bindings/interrupt-controller/apple-aic.h F: include/dt-bindings/pinctrl/apple.h F: include/linux/apple-mailbox.h From 5225e1b87432dcf0d0fc3440824b91d04c1d6cc1 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 27 Dec 2021 19:00:24 +0100 Subject: [PATCH 0041/1295] ARM: dts: meson: Fix the UART compatible strings The dt-bindings for the UART controller only allow the following values for Meson6 SoCs: - "amlogic,meson6-uart", "amlogic,meson-ao-uart" - "amlogic,meson6-uart" Use the correct fallback compatible string "amlogic,meson-ao-uart" for AO UART. Drop the "amlogic,meson-uart" compatible string from the EE domain UART controllers. Fixes: ec9b59162fd831 ("ARM: dts: meson6: use stable UART bindings") Signed-off-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20211227180026.4068352-2-martin.blumenstingl@googlemail.com --- arch/arm/boot/dts/meson.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/meson.dtsi b/arch/arm/boot/dts/meson.dtsi index 3be7cba603d5..26eaba3fa96f 100644 --- a/arch/arm/boot/dts/meson.dtsi +++ b/arch/arm/boot/dts/meson.dtsi @@ -59,7 +59,7 @@ }; uart_A: serial@84c0 { - compatible = "amlogic,meson6-uart", "amlogic,meson-uart"; + compatible = "amlogic,meson6-uart"; reg = <0x84c0 0x18>; interrupts = ; fifo-size = <128>; @@ -67,7 +67,7 @@ }; uart_B: serial@84dc { - compatible = "amlogic,meson6-uart", "amlogic,meson-uart"; + compatible = "amlogic,meson6-uart"; reg = <0x84dc 0x18>; interrupts = ; status = "disabled"; @@ -105,7 +105,7 @@ }; uart_C: serial@8700 { - compatible = "amlogic,meson6-uart", "amlogic,meson-uart"; + compatible = "amlogic,meson6-uart"; reg = <0x8700 0x18>; interrupts = ; status = "disabled"; @@ -228,7 +228,7 @@ }; uart_AO: serial@4c0 { - compatible = "amlogic,meson6-uart", "amlogic,meson-ao-uart", "amlogic,meson-uart"; + compatible = "amlogic,meson6-uart", "amlogic,meson-ao-uart"; reg = <0x4c0 0x18>; interrupts = ; status = "disabled"; From 57007bfb5469ba31cacf69d52195e8b75f43e32d Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 27 Dec 2021 19:00:25 +0100 Subject: [PATCH 0042/1295] ARM: dts: meson8: Fix the UART device-tree schema validation The dt-bindings for the UART controller only allow the following values for Meson8 SoCs: - "amlogic,meson8-uart", "amlogic,meson-ao-uart" - "amlogic,meson8-uart" Use the correct fallback compatible string "amlogic,meson-ao-uart" for AO UART. Drop the "amlogic,meson-uart" compatible string from the EE domain UART controllers. Also update the order of the clocks to match the order defined in the yaml schema. Fixes: 6ca77502050eff ("ARM: dts: meson8: use stable UART bindings with correct gate clock") Signed-off-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20211227180026.4068352-3-martin.blumenstingl@googlemail.com --- arch/arm/boot/dts/meson8.dtsi | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/dts/meson8.dtsi b/arch/arm/boot/dts/meson8.dtsi index f80ddc98d3a2..9997a5d0333a 100644 --- a/arch/arm/boot/dts/meson8.dtsi +++ b/arch/arm/boot/dts/meson8.dtsi @@ -736,27 +736,27 @@ }; &uart_AO { - compatible = "amlogic,meson8-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_CLK81>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8-uart", "amlogic,meson-ao-uart"; + clocks = <&xtal>, <&clkc CLKID_CLK81>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_A { - compatible = "amlogic,meson8-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART0>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_B { - compatible = "amlogic,meson8-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART1>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_C { - compatible = "amlogic,meson8-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART2>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &usb0 { From 3375aa77135f6aeb1107ed839a2050a4118444bc Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 27 Dec 2021 19:00:26 +0100 Subject: [PATCH 0043/1295] ARM: dts: meson8b: Fix the UART device-tree schema validation The dt-bindings for the UART controller only allow the following values for Meson8 SoCs: - "amlogic,meson8b-uart", "amlogic,meson-ao-uart" - "amlogic,meson8b-uart" Use the correct fallback compatible string "amlogic,meson-ao-uart" for AO UART. Drop the "amlogic,meson-uart" compatible string from the EE domain UART controllers. Also update the order of the clocks to match the order defined in the yaml bindings. Fixes: b02d6e73f5fc96 ("ARM: dts: meson8b: use stable UART bindings with correct gate clock") Signed-off-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20211227180026.4068352-4-martin.blumenstingl@googlemail.com --- arch/arm/boot/dts/meson8b.dtsi | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/dts/meson8b.dtsi b/arch/arm/boot/dts/meson8b.dtsi index b49b7cbaed4e..94f1c03decce 100644 --- a/arch/arm/boot/dts/meson8b.dtsi +++ b/arch/arm/boot/dts/meson8b.dtsi @@ -724,27 +724,27 @@ }; &uart_AO { - compatible = "amlogic,meson8b-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_CLK81>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8b-uart", "amlogic,meson-ao-uart"; + clocks = <&xtal>, <&clkc CLKID_CLK81>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_A { - compatible = "amlogic,meson8b-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART0>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8b-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_B { - compatible = "amlogic,meson8b-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART1>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8b-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &uart_C { - compatible = "amlogic,meson8b-uart", "amlogic,meson-uart"; - clocks = <&clkc CLKID_CLK81>, <&xtal>, <&clkc CLKID_UART2>; - clock-names = "baud", "xtal", "pclk"; + compatible = "amlogic,meson8b-uart"; + clocks = <&xtal>, <&clkc CLKID_UART0>, <&clkc CLKID_CLK81>; + clock-names = "xtal", "pclk", "baud"; }; &usb0 { From e958b5884725dac86d36c1e7afe5a55f31feb0b2 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Fri, 7 Jan 2022 15:47:06 -0600 Subject: [PATCH 0044/1295] ASoC: xilinx: xlnx_formatter_pcm: Make buffer bytes multiple of period bytes This patch is based on one in the Xilinx kernel tree, "ASoc: xlnx: Make buffer bytes multiple of period bytes" by Devarsh Thakkar. The same issue exists in the mainline version of the driver. The original patch description is as follows: "The Xilinx Audio Formatter IP has a constraint on period bytes to be multiple of 64. This leads to driver changing the period size to suitable frames such that period bytes are multiple of 64. Now since period bytes and period size are updated but not the buffer bytes, this may make the buffer bytes unaligned and not multiple of period bytes. When this happens we hear popping noise as while DMA is being done the buffer bytes are not enough to complete DMA access for last period of frame within the application buffer boundary. To avoid this, align buffer bytes too as multiple of 64, and set another constraint to always enforce number of periods as integer. Now since, there is already a rule in alsa core to enforce Buffer size = Number of Periods * Period Size this automatically aligns buffer bytes as multiple of period bytes." Fixes: 6f6c3c36f091 ("ASoC: xlnx: add pcm formatter platform driver") Cc: Devarsh Thakkar Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220107214711.1100162-2-robert.hancock@calian.com Signed-off-by: Mark Brown --- sound/soc/xilinx/xlnx_formatter_pcm.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/sound/soc/xilinx/xlnx_formatter_pcm.c b/sound/soc/xilinx/xlnx_formatter_pcm.c index 91afea9d5de6..ce19a6058b27 100644 --- a/sound/soc/xilinx/xlnx_formatter_pcm.c +++ b/sound/soc/xilinx/xlnx_formatter_pcm.c @@ -37,6 +37,7 @@ #define XLNX_AUD_XFER_COUNT 0x28 #define XLNX_AUD_CH_STS_START 0x2C #define XLNX_BYTES_PER_CH 0x44 +#define XLNX_AUD_ALIGN_BYTES 64 #define AUD_STS_IOC_IRQ_MASK BIT(31) #define AUD_STS_CH_STS_MASK BIT(29) @@ -368,12 +369,32 @@ static int xlnx_formatter_pcm_open(struct snd_soc_component *component, snd_soc_set_runtime_hwparams(substream, &xlnx_pcm_hardware); runtime->private_data = stream_data; - /* Resize the period size divisible by 64 */ + /* Resize the period bytes as divisible by 64 */ err = snd_pcm_hw_constraint_step(runtime, 0, - SNDRV_PCM_HW_PARAM_PERIOD_BYTES, 64); + SNDRV_PCM_HW_PARAM_PERIOD_BYTES, + XLNX_AUD_ALIGN_BYTES); if (err) { dev_err(component->dev, - "unable to set constraint on period bytes\n"); + "Unable to set constraint on period bytes\n"); + return err; + } + + /* Resize the buffer bytes as divisible by 64 */ + err = snd_pcm_hw_constraint_step(runtime, 0, + SNDRV_PCM_HW_PARAM_BUFFER_BYTES, + XLNX_AUD_ALIGN_BYTES); + if (err) { + dev_err(component->dev, + "Unable to set constraint on buffer bytes\n"); + return err; + } + + /* Set periods as integer multiple */ + err = snd_pcm_hw_constraint_integer(runtime, + SNDRV_PCM_HW_PARAM_PERIODS); + if (err < 0) { + dev_err(component->dev, + "Unable to set constraint on periods to be integer\n"); return err; } From b114dda6f2f10cc8b2ddcde3285a576fe3f12c5d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:05 +0100 Subject: [PATCH 0045/1295] scsi: message: fusion: Remove usage of the deprecated "pci-dma-compat.h" API In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. A coccinelle script has been used to perform the needed transformation. It can be found in [3]. In this patch, all functions but pci_alloc_consistent() are handled. pci_alloc_consistent() needs more attention and explanation. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ [3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/ Link: https://lore.kernel.org/r/e38e897fbd3314718315b0e357c824e3f01775d6.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 94 +++++++++++++++++--------------- drivers/message/fusion/mptctl.c | 51 ++++++++++------- drivers/message/fusion/mptlan.c | 90 ++++++++++++++++-------------- drivers/message/fusion/mptsas.c | 51 ++++++++--------- 4 files changed, 154 insertions(+), 132 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 24a4532053e4..5a3b7b56e85a 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -316,8 +316,8 @@ mpt_is_discovery_complete(MPT_ADAPTER *ioc) rc = 1; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return rc; } @@ -1661,16 +1661,14 @@ mpt_mapresources(MPT_ADAPTER *ioc) const uint64_t required_mask = dma_get_required_mask (&pdev->dev); if (required_mask > DMA_BIT_MASK(32) - && !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(64))) { + && !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) { ioc->dma_mask = DMA_BIT_MASK(64); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 64 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", ioc->name)); - } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(32))) { + } else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32))) { ioc->dma_mask = DMA_BIT_MASK(32); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 32 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", @@ -1681,9 +1679,8 @@ mpt_mapresources(MPT_ADAPTER *ioc) goto out_pci_release_region; } } else { - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(32))) { + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32))) { ioc->dma_mask = DMA_BIT_MASK(32); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 32 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", @@ -2769,9 +2766,9 @@ mpt_adapter_disable(MPT_ADAPTER *ioc) if (ioc->spi_data.pIocPg4 != NULL) { sz = ioc->spi_data.IocPg4Sz; - pci_free_consistent(ioc->pcidev, sz, - ioc->spi_data.pIocPg4, - ioc->spi_data.IocPg4_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, + ioc->spi_data.pIocPg4, + ioc->spi_data.IocPg4_dma); ioc->spi_data.pIocPg4 = NULL; ioc->alloc_total -= sz; } @@ -3548,7 +3545,8 @@ mpt_free_fw_memory(MPT_ADAPTER *ioc) sz = ioc->facts.FWImageSize; dinitprintk(ioc, printk(MYIOC_s_DEBUG_FMT "free_fw_memory: FW Image @ %p[%p], sz=%d[%x] bytes\n", ioc->name, ioc->cached_fw, (void *)(ulong)ioc->cached_fw_dma, sz, sz)); - pci_free_consistent(ioc->pcidev, sz, ioc->cached_fw, ioc->cached_fw_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, ioc->cached_fw, + ioc->cached_fw_dma); ioc->alloc_total -= sz; ioc->cached_fw = NULL; } @@ -4447,9 +4445,8 @@ PrimeIocFifos(MPT_ADAPTER *ioc) */ if (ioc->pcidev->device == MPI_MANUFACTPAGE_DEVID_SAS1078 && ioc->dma_mask > DMA_BIT_MASK(35)) { - if (!pci_set_dma_mask(ioc->pcidev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(ioc->pcidev, - DMA_BIT_MASK(32))) { + if (!dma_set_mask(&ioc->pcidev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&ioc->pcidev->dev, DMA_BIT_MASK(32))) { dma_mask = DMA_BIT_MASK(35); d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "setting 35 bit addressing for " @@ -4457,10 +4454,10 @@ PrimeIocFifos(MPT_ADAPTER *ioc) ioc->name)); } else { /*Reseting DMA mask to 64 bit*/ - pci_set_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)); - pci_set_consistent_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)); + dma_set_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)); + dma_set_coherent_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)); printk(MYIOC_s_ERR_FMT "failed setting 35 bit addressing for " @@ -4595,8 +4592,8 @@ PrimeIocFifos(MPT_ADAPTER *ioc) alloc_dma += ioc->reply_sz; } - if (dma_mask == DMA_BIT_MASK(35) && !pci_set_dma_mask(ioc->pcidev, - ioc->dma_mask) && !pci_set_consistent_dma_mask(ioc->pcidev, + if (dma_mask == DMA_BIT_MASK(35) && !dma_set_mask(&ioc->pcidev->dev, + ioc->dma_mask) && !dma_set_coherent_mask(&ioc->pcidev->dev, ioc->dma_mask)) d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "restoring 64 bit addressing\n", ioc->name)); @@ -4620,8 +4617,8 @@ out_fail: ioc->sense_buf_pool = NULL; } - if (dma_mask == DMA_BIT_MASK(35) && !pci_set_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)) && !pci_set_consistent_dma_mask(ioc->pcidev, + if (dma_mask == DMA_BIT_MASK(35) && !dma_set_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)) && !dma_set_coherent_mask(&ioc->pcidev->dev, DMA_BIT_MASK(64))) d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "restoring 64 bit addressing\n", ioc->name)); @@ -4982,7 +4979,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage0_alloc, page0_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage0_alloc, page0_dma); /* FIXME! * Normalize endianness of structure data, @@ -5026,7 +5024,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) memcpy(&ioc->lan_cnfg_page1, ppage1_alloc, copy_sz); } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage1_alloc, page1_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage1_alloc, page1_dma); /* FIXME! * Normalize endianness of structure data, @@ -5325,7 +5324,8 @@ GetIoUnitPage2(MPT_ADAPTER *ioc) if ((rc = mpt_config(ioc, &cfg)) == 0) ioc->biosVersion = le32_to_cpu(ppage_alloc->BiosVersion); - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage_alloc, page_dma); } return rc; @@ -5456,7 +5456,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) } } if (pbuf) { - pci_free_consistent(ioc->pcidev, header.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + header.PageLength * 4, pbuf, + buf_dma); } } } @@ -5543,7 +5545,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) } } - pci_free_consistent(ioc->pcidev, header.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + header.PageLength * 4, pbuf, + buf_dma); } } @@ -5707,8 +5711,8 @@ mpt_inactive_raid_volumes(MPT_ADAPTER *ioc, u8 channel, u8 id) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /** @@ -5776,8 +5780,8 @@ mpt_raid_phys_disk_pg0(MPT_ADAPTER *ioc, u8 phys_disk_num, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -5840,8 +5844,8 @@ mpt_raid_phys_disk_get_num_paths(MPT_ADAPTER *ioc, u8 phys_disk_num) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -5929,8 +5933,8 @@ mpt_raid_phys_disk_pg1(MPT_ADAPTER *ioc, u8 phys_disk_num, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -6011,7 +6015,7 @@ mpt_findImVolumes(MPT_ADAPTER *ioc) pIoc2->RaidVolume[i].VolumeID); out: - pci_free_consistent(ioc->pcidev, iocpage2sz, pIoc2, ioc2_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage2sz, pIoc2, ioc2_dma); return rc; } @@ -6070,7 +6074,7 @@ mpt_read_ioc_pg_3(MPT_ADAPTER *ioc) } } - pci_free_consistent(ioc->pcidev, iocpage3sz, pIoc3, ioc3_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage3sz, pIoc3, ioc3_dma); return 0; } @@ -6122,7 +6126,8 @@ mpt_read_ioc_pg_4(MPT_ADAPTER *ioc) ioc->spi_data.IocPg4_dma = ioc4_dma; ioc->spi_data.IocPg4Sz = iocpage4sz; } else { - pci_free_consistent(ioc->pcidev, iocpage4sz, pIoc4, ioc4_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage4sz, pIoc4, + ioc4_dma); ioc->spi_data.pIocPg4 = NULL; ioc->alloc_total -= iocpage4sz; } @@ -6210,7 +6215,7 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc) } } - pci_free_consistent(ioc->pcidev, iocpage1sz, pIoc1, ioc1_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage1sz, pIoc1, ioc1_dma); return; } @@ -6255,7 +6260,8 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc) out: if (pbuf) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, pbuf, + buf_dma); } /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index ae433c150b37..0f447179e3f5 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -1046,9 +1046,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, goto free_and_fail; if (sgdir & 0x04000000) - dir = PCI_DMA_TODEVICE; + dir = DMA_TO_DEVICE; else - dir = PCI_DMA_FROMDEVICE; + dir = DMA_FROM_DEVICE; /* At start: * sgl = sglbuf = point to beginning of sg buffer @@ -1080,8 +1080,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, bytes_allocd += this_alloc; sgl->FlagsLength = (0x10000000|sgdir|this_alloc); - dma_addr = pci_map_single(ioc->pcidev, - buflist[buflist_ent].kptr, this_alloc, dir); + dma_addr = dma_map_single(&ioc->pcidev->dev, + buflist[buflist_ent].kptr, + this_alloc, dir); sgl->Address = dma_addr; fragcnt++; @@ -1140,9 +1141,11 @@ free_and_fail: kptr = buflist[i].kptr; len = buflist[i].len; - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, + dma_addr); } - pci_free_consistent(ioc->pcidev, MAX_SGL_BYTES, sglbuf, *sglbuf_dma); + dma_free_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, sglbuf, + *sglbuf_dma); } kfree(buflist); return NULL; @@ -1162,9 +1165,9 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE int n = 0; if (sg->FlagsLength & 0x04000000) - dir = PCI_DMA_TODEVICE; + dir = DMA_TO_DEVICE; else - dir = PCI_DMA_FROMDEVICE; + dir = DMA_FROM_DEVICE; nib = (sg->FlagsLength & 0xF0000000) >> 28; while (! (nib & 0x4)) { /* eob */ @@ -1179,8 +1182,10 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE dma_addr = sg->Address; kptr = bl->kptr; len = bl->len; - pci_unmap_single(ioc->pcidev, dma_addr, len, dir); - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_unmap_single(&ioc->pcidev->dev, dma_addr, len, + dir); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, + dma_addr); n++; } sg++; @@ -1197,12 +1202,12 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE dma_addr = sg->Address; kptr = bl->kptr; len = bl->len; - pci_unmap_single(ioc->pcidev, dma_addr, len, dir); - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_unmap_single(&ioc->pcidev->dev, dma_addr, len, dir); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, dma_addr); n++; } - pci_free_consistent(ioc->pcidev, MAX_SGL_BYTES, sgl, sgl_dma); + dma_free_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, sgl, sgl_dma); kfree(buflist); dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "-SG: Free'd 1 SGL buf + %d kbufs!\n", ioc->name, n)); @@ -2283,13 +2288,13 @@ done_free_mem: /* Free the allocated memory. */ if (bufOut.kptr != NULL) { - pci_free_consistent(ioc->pcidev, - bufOut.len, (void *) bufOut.kptr, dma_addr_out); + dma_free_coherent(&ioc->pcidev->dev, bufOut.len, + (void *)bufOut.kptr, dma_addr_out); } if (bufIn.kptr != NULL) { - pci_free_consistent(ioc->pcidev, - bufIn.len, (void *) bufIn.kptr, dma_addr_in); + dma_free_coherent(&ioc->pcidev->dev, bufIn.len, + (void *)bufIn.kptr, dma_addr_in); } /* mf is null if command issued successfully @@ -2405,7 +2410,9 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) pdata->BoardTracerNumber, 24); } } - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + hdr.PageLength * 4, pbuf, + buf_dma); pbuf = NULL; } } @@ -2519,7 +2526,7 @@ retry_wait: SET_MGMT_MSG_CONTEXT(ioc->ioctl_cmds.msg_context, 0); if (pbuf) - pci_free_consistent(ioc->pcidev, 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, 4, pbuf, buf_dma); /* Copy the data from kernel memory to user memory */ @@ -2623,7 +2630,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) karg.negotiated_speed = HP_DEV_SPEED_ASYNC; } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) pg0_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, (u8 *)pg0_alloc, + page_dma); } /* Set defaults @@ -2658,7 +2666,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) karg.phase_errors = (u32) le16_to_cpu(pg3_alloc->PhaseErrorCount); karg.parity_errors = (u32) le16_to_cpu(pg3_alloc->ParityErrorCount); } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) pg3_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)pg3_alloc, page_dma); } } hd = shost_priv(ioc->sh); diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index 117fa4ebf6d7..142eb5d5d9df 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -516,9 +516,9 @@ mpt_lan_close(struct net_device *dev) if (priv->RcvCtl[i].skb != NULL) { /**/ dlprintk((KERN_INFO MYNAM "/lan_close: bucket %05x " /**/ "is still out\n", i)); - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[i].dma, - priv->RcvCtl[i].len, - PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->RcvCtl[i].dma, + priv->RcvCtl[i].len, DMA_FROM_DEVICE); dev_kfree_skb(priv->RcvCtl[i].skb); } } @@ -528,9 +528,9 @@ mpt_lan_close(struct net_device *dev) for (i = 0; i < priv->tx_max_out; i++) { if (priv->SendCtl[i].skb != NULL) { - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[i].dma, - priv->SendCtl[i].len, - PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->SendCtl[i].dma, + priv->SendCtl[i].len, DMA_TO_DEVICE); dev_kfree_skb(priv->SendCtl[i].skb); } } @@ -582,8 +582,8 @@ mpt_lan_send_turbo(struct net_device *dev, u32 tmsg) __func__, sent)); priv->SendCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[ctx].dma, - priv->SendCtl[ctx].len, PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->SendCtl[ctx].dma, + priv->SendCtl[ctx].len, DMA_TO_DEVICE); dev_kfree_skb_irq(sent); spin_lock_irqsave(&priv->txfidx_lock, flags); @@ -648,8 +648,9 @@ mpt_lan_send_reply(struct net_device *dev, LANSendReply_t *pSendRep) __func__, sent)); priv->SendCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[ctx].dma, - priv->SendCtl[ctx].len, PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->SendCtl[ctx].dma, + priv->SendCtl[ctx].len, DMA_TO_DEVICE); dev_kfree_skb_irq(sent); priv->mpt_txfidx[++priv->mpt_txfidx_tail] = ctx; @@ -720,8 +721,8 @@ mpt_lan_sdu_send (struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, 12); - dma = pci_map_single(mpt_dev->pcidev, skb->data, skb->len, - PCI_DMA_TODEVICE); + dma = dma_map_single(&mpt_dev->pcidev->dev, skb->data, skb->len, + DMA_TO_DEVICE); priv->SendCtl[ctx].skb = skb; priv->SendCtl[ctx].dma = dma; @@ -868,13 +869,17 @@ mpt_lan_receive_post_turbo(struct net_device *dev, u32 tmsg) return -ENOMEM; } - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, len), len); - pci_dma_sync_single_for_device(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); goto out; } @@ -882,8 +887,8 @@ mpt_lan_receive_post_turbo(struct net_device *dev, u32 tmsg) priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); out: spin_lock_irqsave(&priv->rxfidx_lock, flags); @@ -927,8 +932,8 @@ mpt_lan_receive_post_free(struct net_device *dev, // dlprintk((KERN_INFO MYNAM "@rpr[2] TC + 3\n")); priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1028,16 +1033,16 @@ mpt_lan_receive_post_reply(struct net_device *dev, // IOC_AND_NETDEV_NAMES_s_s(dev), // i, l)); - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, l), l); - pci_dma_sync_single_for_device(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; szrem -= l; @@ -1056,17 +1061,17 @@ mpt_lan_receive_post_reply(struct net_device *dev, return -ENOMEM; } - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, len), len); - pci_dma_sync_single_for_device(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); spin_lock_irqsave(&priv->rxfidx_lock, flags); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1077,8 +1082,8 @@ mpt_lan_receive_post_reply(struct net_device *dev, priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); priv->RcvCtl[ctx].dma = 0; priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1199,10 +1204,10 @@ mpt_lan_post_receive_buckets(struct mpt_lan_priv *priv) skb = priv->RcvCtl[ctx].skb; if (skb && (priv->RcvCtl[ctx].len != len)) { - pci_unmap_single(mpt_dev->pcidev, + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + DMA_FROM_DEVICE); dev_kfree_skb(priv->RcvCtl[ctx].skb); skb = priv->RcvCtl[ctx].skb = NULL; } @@ -1218,8 +1223,9 @@ mpt_lan_post_receive_buckets(struct mpt_lan_priv *priv) break; } - dma = pci_map_single(mpt_dev->pcidev, skb->data, - len, PCI_DMA_FROMDEVICE); + dma = dma_map_single(&mpt_dev->pcidev->dev, + skb->data, len, + DMA_FROM_DEVICE); priv->RcvCtl[ctx].skb = skb; priv->RcvCtl[ctx].dma = dma; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 091b45024d34..0363b2a2264d 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -769,8 +769,8 @@ mptsas_add_device_component_starget_ir(MPT_ADAPTER *ioc, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /** @@ -1426,8 +1426,8 @@ mptsas_sas_enclosure_pg0(MPT_ADAPTER *ioc, struct mptsas_enclosure *enclosure, enclosure->sep_channel = buffer->SEPBus; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2081,8 +2081,8 @@ static int mptsas_get_linkerrors(struct sas_phy *phy) le32_to_cpu(buffer->PhyResetProblemCount); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); return error; } @@ -2301,7 +2301,7 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, << MPI_SGE_FLAGS_SHIFT; if (!dma_map_sg(&ioc->pcidev->dev, job->request_payload.sg_list, - 1, PCI_DMA_BIDIRECTIONAL)) + 1, DMA_BIDIRECTIONAL)) goto put_mf; flagsLength |= (sg_dma_len(job->request_payload.sg_list) - 4); @@ -2318,7 +2318,7 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, flagsLength = flagsLength << MPI_SGE_FLAGS_SHIFT; if (!dma_map_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, - 1, PCI_DMA_BIDIRECTIONAL)) + 1, DMA_BIDIRECTIONAL)) goto unmap_out; flagsLength |= sg_dma_len(job->reply_payload.sg_list) + 4; ioc->add_sge(psge, flagsLength, @@ -2356,10 +2356,10 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, unmap_in: dma_unmap_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, 1, - PCI_DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); unmap_out: dma_unmap_sg(&ioc->pcidev->dev, job->request_payload.sg_list, 1, - PCI_DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); put_mf: if (mf) mpt_free_msg_frame(ioc, mf); @@ -2452,8 +2452,8 @@ mptsas_sas_io_unit_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info) } out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2509,8 +2509,8 @@ mptsas_sas_io_unit_pg1(MPT_ADAPTER *ioc) device_missing_delay & MPI_SAS_IOUNIT1_REPORT_MISSING_TIMEOUT_MASK; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2573,8 +2573,8 @@ mptsas_sas_phy_pg0(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, phy_info->attached.handle = le16_to_cpu(buffer->AttachedDevHandle); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2654,8 +2654,8 @@ mptsas_sas_device_pg0(MPT_ADAPTER *ioc, struct mptsas_devinfo *device_info, device_info->flags = le16_to_cpu(buffer->Flags); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2737,8 +2737,8 @@ mptsas_sas_expander_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info, } out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2810,8 +2810,8 @@ mptsas_sas_expander_pg1(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, phy_info->attached.handle = le16_to_cpu(buffer->AttachedDevHandle); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2987,7 +2987,8 @@ mptsas_exp_repmanufacture_info(MPT_ADAPTER *ioc, } out_free: if (data_out_dma) - pci_free_consistent(ioc->pcidev, sz, data_out, data_out_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, data_out, + data_out_dma); put_mf: if (mf) mpt_free_msg_frame(ioc, mf); @@ -4318,8 +4319,8 @@ mptsas_adding_inactive_raid_components(MPT_ADAPTER *ioc, u8 channel, u8 id) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /* * Work queue thread to handle SAS hotplug events From 2d50607260a6c142f49222346814fee30eeaba9e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:13 +0100 Subject: [PATCH 0046/1295] scsi: message: fusion: Use dma_alloc_coherent() in mpt_alloc_fw_memory() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. mpt_alloc_fw_memory() should still use GFP_ATOMIC, because it can be called from mpt_do_upload() which might sleep. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/db3db9db219005b75659561d08117d312d0cfb13.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 5a3b7b56e85a..c4702ef87897 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -3512,7 +3512,8 @@ mpt_alloc_fw_memory(MPT_ADAPTER *ioc, int size) rc = 0; goto out; } - ioc->cached_fw = pci_alloc_consistent(ioc->pcidev, size, &ioc->cached_fw_dma); + ioc->cached_fw = dma_alloc_coherent(&ioc->pcidev->dev, size, + &ioc->cached_fw_dma, GFP_ATOMIC); if (!ioc->cached_fw) { printk(MYIOC_s_ERR_FMT "Unable to allocate memory for the cached firmware image!\n", ioc->name); From 5c5e6b6f61e0196141db38b84b30ef9b3bcca0f2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:19 +0100 Subject: [PATCH 0047/1295] scsi: message: fusion: mptbase: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. In all these places where some memory is allocated GFP_KERNEL can be used because they already call mpt_config() which has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/3bea2452deb8cc8be65982e87efa4c6861caa01c.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 52 ++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index c4702ef87897..e90adfa57950 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -300,8 +300,8 @@ mpt_is_discovery_complete(MPT_ADAPTER *ioc) if (!hdr.ExtPageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -4966,7 +4966,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) if (hdr.PageLength > 0) { data_sz = hdr.PageLength * 4; - ppage0_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page0_dma); + ppage0_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page0_dma, GFP_KERNEL); rc = -ENOMEM; if (ppage0_alloc) { memset((u8 *)ppage0_alloc, 0, data_sz); @@ -5013,7 +5014,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) data_sz = hdr.PageLength * 4; rc = -ENOMEM; - ppage1_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page1_dma); + ppage1_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page1_dma, GFP_KERNEL); if (ppage1_alloc) { memset((u8 *)ppage1_alloc, 0, data_sz); cfg.physAddr = page1_dma; @@ -5315,7 +5317,8 @@ GetIoUnitPage2(MPT_ADAPTER *ioc) /* Read the config page */ data_sz = hdr.PageLength * 4; rc = -ENOMEM; - ppage_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + ppage_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page_dma, GFP_KERNEL); if (ppage_alloc) { memset((u8 *)ppage_alloc, 0, data_sz); cfg.physAddr = page_dma; @@ -5401,7 +5404,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) return -EFAULT; if (header.PageLength > 0) { - pbuf = pci_alloc_consistent(ioc->pcidev, header.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + header.PageLength * 4, &buf_dma, + GFP_KERNEL); if (pbuf) { cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; cfg.physAddr = buf_dma; @@ -5481,7 +5486,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) if (header.PageLength > 0) { /* Allocate memory and read SCSI Port Page 2 */ - pbuf = pci_alloc_consistent(ioc->pcidev, header.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + header.PageLength * 4, &buf_dma, + GFP_KERNEL); if (pbuf) { cfg.action = MPI_CONFIG_ACTION_PAGE_READ_NVRAM; cfg.physAddr = buf_dma; @@ -5664,8 +5671,8 @@ mpt_inactive_raid_volumes(MPT_ADAPTER *ioc, u8 channel, u8 id) if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -5757,8 +5764,8 @@ mpt_raid_phys_disk_pg0(MPT_ADAPTER *ioc, u8 phys_disk_num, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = -ENOMEM; @@ -5824,8 +5831,8 @@ mpt_raid_phys_disk_get_num_paths(MPT_ADAPTER *ioc, u8 phys_disk_num) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = 0; @@ -5896,8 +5903,8 @@ mpt_raid_phys_disk_pg1(MPT_ADAPTER *ioc, u8 phys_disk_num, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = -ENOMEM; @@ -5991,7 +5998,8 @@ mpt_findImVolumes(MPT_ADAPTER *ioc) return -EFAULT; iocpage2sz = header.PageLength * 4; - pIoc2 = pci_alloc_consistent(ioc->pcidev, iocpage2sz, &ioc2_dma); + pIoc2 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage2sz, &ioc2_dma, + GFP_KERNEL); if (!pIoc2) return -ENOMEM; @@ -6058,7 +6066,8 @@ mpt_read_ioc_pg_3(MPT_ADAPTER *ioc) /* Read Header good, alloc memory */ iocpage3sz = header.PageLength * 4; - pIoc3 = pci_alloc_consistent(ioc->pcidev, iocpage3sz, &ioc3_dma); + pIoc3 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage3sz, &ioc3_dma, + GFP_KERNEL); if (!pIoc3) return 0; @@ -6109,7 +6118,8 @@ mpt_read_ioc_pg_4(MPT_ADAPTER *ioc) if ( (pIoc4 = ioc->spi_data.pIocPg4) == NULL ) { iocpage4sz = (header.PageLength + 4) * 4; /* Allow 4 additional SEP's */ - pIoc4 = pci_alloc_consistent(ioc->pcidev, iocpage4sz, &ioc4_dma); + pIoc4 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage4sz, + &ioc4_dma, GFP_KERNEL); if (!pIoc4) return; ioc->alloc_total += iocpage4sz; @@ -6165,7 +6175,8 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc) /* Read Header good, alloc memory */ iocpage1sz = header.PageLength * 4; - pIoc1 = pci_alloc_consistent(ioc->pcidev, iocpage1sz, &ioc1_dma); + pIoc1 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage1sz, &ioc1_dma, + GFP_KERNEL); if (!pIoc1) return; @@ -6245,7 +6256,8 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc) goto out; cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; - pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &buf_dma, GFP_KERNEL); if (!pbuf) goto out; From 7a960b3a5e37f05d7d319bf257a8cf6ee9d18e7c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:26 +0100 Subject: [PATCH 0048/1295] scsi: message: fusion: Use dma_alloc_coherent() in mptsas_exp_repmanufacture_info() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. The only caller of mptsas_exp_repmanufacture_info() is mptsas_probe_one_phy(). This function already calls sas_end_device_alloc() or sas_expander_alloc(). They both already use GFP_KERNEL. As no spin_lock is held at this point, it is safe to also use GFP_KERNEL here. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/d78d4a5b096897932808ed7e3a4540db1687c25d.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptsas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 0363b2a2264d..9b40be04710c 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2896,7 +2896,8 @@ mptsas_exp_repmanufacture_info(MPT_ADAPTER *ioc, sz = sizeof(struct rep_manu_request) + sizeof(struct rep_manu_reply); - data_out = pci_alloc_consistent(ioc->pcidev, sz, &data_out_dma); + data_out = dma_alloc_coherent(&ioc->pcidev->dev, sz, &data_out_dma, + GFP_KERNEL); if (!data_out) { printk(KERN_ERR "Memory allocation failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); From 76a334d756c596f2f283e1d0054477ba0f0eacf6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:33 +0100 Subject: [PATCH 0049/1295] scsi: message: fusion: mptsas: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. In all these places where some memory is allocated GFP_KERNEL can be used because they already call mpt_config() which has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/443b81ecb08b2fe6f789bb2fdff13a53c809e401.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptsas.c | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 9b40be04710c..4acd8f9a48e1 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -702,8 +702,8 @@ mptsas_add_device_component_starget_ir(MPT_ADAPTER *ioc, if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -1399,8 +1399,8 @@ mptsas_sas_enclosure_pg0(MPT_ADAPTER *ioc, struct mptsas_enclosure *enclosure, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2058,8 +2058,8 @@ static int mptsas_get_linkerrors(struct sas_phy *phy) if (!hdr.ExtPageLength) return -ENXIO; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -2412,8 +2412,8 @@ mptsas_sas_io_unit_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2487,8 +2487,8 @@ mptsas_sas_io_unit_pg1(MPT_ADAPTER *ioc) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2551,8 +2551,8 @@ mptsas_sas_phy_pg0(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2614,8 +2614,8 @@ mptsas_sas_device_pg0(MPT_ADAPTER *ioc, struct mptsas_devinfo *device_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2697,8 +2697,8 @@ mptsas_sas_expander_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2777,8 +2777,8 @@ mptsas_sas_expander_pg1(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -4273,8 +4273,8 @@ mptsas_adding_inactive_raid_components(MPT_ADAPTER *ioc, u8 channel, u8 id) if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; From 706dc3b91989a1286b0eb75a1cd816596c590e5c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:39 +0100 Subject: [PATCH 0050/1295] scsi: message: fusion: mptctl: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. When memory is allocated in kbuf_alloc_2_sgl() GFP_KERNEL can be used because this function already uses the GFP_USER flag for some memory allocation and not spin_lock is taken in the between. When memory is allocated in mptctl_do_mpt_command() GFP_KERNEL can be used because this function already uses copy_from_user() and this function can sleep. When memory is allocated in mptctl_hp_hostinfo() GFP_KERNEL can be used because this function already uses mpt_config() and this function has an explicit might_sleep(). When memory is allocated in mptctl_hp_targetinfo() GFP_KERNEL can be used because this function already uses mpt_config() and this function has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/516375d6d06114484533baf03aae351306100246.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptctl.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 0f447179e3f5..03c8fb1795c2 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -1041,7 +1041,8 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, * copying the data in this array into the correct place in the * request and chain buffers. */ - sglbuf = pci_alloc_consistent(ioc->pcidev, MAX_SGL_BYTES, sglbuf_dma); + sglbuf = dma_alloc_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, + sglbuf_dma, GFP_KERNEL); if (sglbuf == NULL) goto free_and_fail; @@ -1062,9 +1063,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, while (bytes_allocd < bytes) { this_alloc = min(alloc_sz, bytes-bytes_allocd); buflist[buflist_ent].len = this_alloc; - buflist[buflist_ent].kptr = pci_alloc_consistent(ioc->pcidev, - this_alloc, - &pa); + buflist[buflist_ent].kptr = dma_alloc_coherent(&ioc->pcidev->dev, + this_alloc, + &pa, GFP_KERNEL); if (buflist[buflist_ent].kptr == NULL) { alloc_sz = alloc_sz / 2; if (alloc_sz == 0) { @@ -2105,8 +2106,9 @@ mptctl_do_mpt_command (MPT_ADAPTER *ioc, struct mpt_ioctl_command karg, void __u } flagsLength |= karg.dataOutSize; bufOut.len = karg.dataOutSize; - bufOut.kptr = pci_alloc_consistent( - ioc->pcidev, bufOut.len, &dma_addr_out); + bufOut.kptr = dma_alloc_coherent(&ioc->pcidev->dev, + bufOut.len, + &dma_addr_out, GFP_KERNEL); if (bufOut.kptr == NULL) { rc = -ENOMEM; @@ -2139,8 +2141,9 @@ mptctl_do_mpt_command (MPT_ADAPTER *ioc, struct mpt_ioctl_command karg, void __u flagsLength |= karg.dataInSize; bufIn.len = karg.dataInSize; - bufIn.kptr = pci_alloc_consistent(ioc->pcidev, - bufIn.len, &dma_addr_in); + bufIn.kptr = dma_alloc_coherent(&ioc->pcidev->dev, + bufIn.len, + &dma_addr_in, GFP_KERNEL); if (bufIn.kptr == NULL) { rc = -ENOMEM; @@ -2400,7 +2403,9 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) /* Issue the second config page request */ cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; - pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + hdr.PageLength * 4, + &buf_dma, GFP_KERNEL); if (pbuf) { cfg.physAddr = buf_dma; if (mpt_config(ioc, &cfg) == 0) { @@ -2477,7 +2482,7 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) else IstwiRWRequest->DeviceAddr = 0xB0; - pbuf = pci_alloc_consistent(ioc->pcidev, 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, 4, &buf_dma, GFP_KERNEL); if (!pbuf) goto out; ioc->add_sge((char *)&IstwiRWRequest->SGL, @@ -2592,7 +2597,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) /* Get the data transfer speeds */ data_sz = ioc->spi_data.sdp0length * 4; - pg0_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + pg0_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, &page_dma, + GFP_KERNEL); if (pg0_alloc) { hdr.PageVersion = ioc->spi_data.sdp0version; hdr.PageLength = data_sz; @@ -2657,7 +2663,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) /* Issue the second config page request */ cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; data_sz = (int) cfg.cfghdr.hdr->PageLength * 4; - pg3_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + pg3_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page_dma, GFP_KERNEL); if (pg3_alloc) { cfg.physAddr = page_dma; cfg.pageAddr = (karg.hdr.channel << 8) | karg.hdr.id; From 1aa7d9799e85addc29c06ece99bf1eae1ef9198f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 9 Jan 2022 19:57:04 +0100 Subject: [PATCH 0051/1295] scsi: efct: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32 bits case will also fail for the same reason. Simplify code and remove some dead code accordingly. While at it, return the error code returned by dma_set_mask_and_coherent() instead of -1. [1]: https://lkml.org/lkml/2021/6/7/398 Link: https://lore.kernel.org/r/958bcb2a6e86344c14f38369e8e7079615a2b0e3.1641754613.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/efct/efct_driver.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/elx/efct/efct_driver.c b/drivers/scsi/elx/efct/efct_driver.c index ae62fc3c9ee3..b08fc8839808 100644 --- a/drivers/scsi/elx/efct/efct_driver.c +++ b/drivers/scsi/elx/efct/efct_driver.c @@ -541,13 +541,10 @@ efct_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, efct); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) != 0) { - dev_warn(&pdev->dev, "trying DMA_BIT_MASK(32)\n"); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)) != 0) { - dev_err(&pdev->dev, "setting DMA_BIT_MASK failed\n"); - rc = -1; - goto dma_mask_out; - } + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) { + dev_err(&pdev->dev, "setting DMA_BIT_MASK failed\n"); + goto dma_mask_out; } num_interrupts = efct_device_interrupts_required(efct); From 9008661e19606bdf6dddd33073b70872da400590 Mon Sep 17 00:00:00 2001 From: SEO HOYOUNG Date: Fri, 7 Jan 2022 06:39:24 +0900 Subject: [PATCH 0052/1295] scsi: ufs: Modify Tactive time setting conditions The Tactive time determines the waiting time before burst at hibern8 exit and is determined by hardware at linkup time. However, in the case of Samsung devices, increase host's Tactive time +100us for stability. If the HCI's Tactive time is equal or greater than the device, +100us should be set. Link: https://lore.kernel.org/r/20220106213924.186263-1-hy50.seo@samsung.com Reviewed-by: Alim Akhtar Acked-by: Avri Altman Signed-off-by: SEO HOYOUNG Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 1049e41abd5b..460d2b440d2e 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7815,7 +7815,7 @@ static int ufshcd_quirk_tune_host_pa_tactivate(struct ufs_hba *hba) peer_pa_tactivate_us = peer_pa_tactivate * gran_to_us_table[peer_granularity - 1]; - if (pa_tactivate_us > peer_pa_tactivate_us) { + if (pa_tactivate_us >= peer_pa_tactivate_us) { u32 new_peer_pa_tactivate; new_peer_pa_tactivate = pa_tactivate_us / From 3ba880a12df5aa4488c18281701b5b1bc3d4531a Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 22 Dec 2021 07:09:30 +0000 Subject: [PATCH 0053/1295] scsi: ufs: ufs-mediatek: Fix error checking in ufs_mtk_init_va09_pwr_ctrl() The function regulator_get() returns an error pointer. Use IS_ERR() to validate the return value. Link: https://lore.kernel.org/r/20211222070930.9449-1-linmq006@gmail.com Fixes: cf137b3ea49a ("scsi: ufs-mediatek: Support VA09 regulator operations") Signed-off-by: Miaoqian Lin Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs-mediatek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/scsi/ufs/ufs-mediatek.c index 5393b5c9dd9c..86a938075f30 100644 --- a/drivers/scsi/ufs/ufs-mediatek.c +++ b/drivers/scsi/ufs/ufs-mediatek.c @@ -557,7 +557,7 @@ static void ufs_mtk_init_va09_pwr_ctrl(struct ufs_hba *hba) struct ufs_mtk_host *host = ufshcd_get_variant(hba); host->reg_va09 = regulator_get(hba->dev, "va09"); - if (!host->reg_va09) + if (IS_ERR(host->reg_va09)) dev_info(hba->dev, "failed to get va09"); else host->caps |= UFS_MTK_CAP_VA09_PWR_CTRL; From 2576e153cd982d540b212e989458edc42ad1b390 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 30 Dec 2021 10:11:37 +0800 Subject: [PATCH 0054/1295] scsi: nsp_cs: Check of ioremap return value Since it is possible for ioremap() to fail, 'data->MmioAddress' could be NULL. Skip entry if ioremap() fails. Link: https://lore.kernel.org/r/20211230021137.1823352-1-jiasheng@iscas.ac.cn Fixes: 0e6f9d270840 ("pcmcia: use pcmcia_loop_config in scsi pcmcia drivers") Signed-off-by: Jiasheng Jiang Signed-off-by: Martin K. Petersen --- drivers/scsi/pcmcia/nsp_cs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c index 8b9e889bc306..92c818a8a84a 100644 --- a/drivers/scsi/pcmcia/nsp_cs.c +++ b/drivers/scsi/pcmcia/nsp_cs.c @@ -1557,6 +1557,9 @@ static int nsp_cs_config_check(struct pcmcia_device *p_dev, void *priv_data) data->MmioAddress = (unsigned long) ioremap(p_dev->resource[2]->start, resource_size(p_dev->resource[2])); + if (!data->MmioAddress) + goto next_entry; + data->MmioLength = resource_size(p_dev->resource[2]); } /* If we got this far, we're cool! */ From 0e906607b9c5ee22312c9af4d8adb45c617ea38a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 15:51:38 +0100 Subject: [PATCH 0055/1295] netfilter: nf_conntrack_netbios_ns: fix helper module alias The helper gets registered as 'netbios-ns', not netbios_ns. Intentionally not adding a fixes-tag because i don't want this to go to stable. This wasn't noticed for a very long time so no so no need to risk regressions. Reported-by: Yi Chen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netbios_ns.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 7f19ee259609..55415f011943 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -20,13 +20,14 @@ #include #include +#define HELPER_NAME "netbios-ns" #define NMBD_PORT 137 MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_netbios_ns"); -MODULE_ALIAS_NFCT_HELPER("netbios_ns"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); static unsigned int timeout __read_mostly = 3; module_param(timeout, uint, 0400); @@ -44,7 +45,7 @@ static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper helper __read_mostly = { - .name = "netbios-ns", + .name = HELPER_NAME, .tuple.src.l3num = NFPROTO_IPV4, .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, From cf46eacbc156a82d6643eb10afe8969abad5a35f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 11 Jan 2022 10:40:30 +0100 Subject: [PATCH 0056/1295] netfilter: nf_tables: remove unused variable > Remove unused variable and fix missing initialization. > > >> net/netfilter/nf_tables_api.c:8266:6: warning: variable 'i' set but not used [-Wunused-but-set-variable] > int i; > ^ Fixes: 2c865a8a28a1 ("netfilter: nf_tables: add rule blob layout") Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 77938b1042f3..1cde8cd0d1a7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8264,14 +8264,12 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha void *data, *data_boundary; struct nft_rule_dp *prule; struct nft_rule *rule; - int i; /* already handled or inactive chain? */ if (chain->blob_next || !nft_is_active_next(net, chain)) return 0; rule = list_entry(&chain->rules, struct nft_rule, list); - i = 0; data_size = 0; list_for_each_entry_continue(rule, &chain->rules, list) { From a64067f4cecaaa4deed8e33d3266bc0bcc189142 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Fri, 7 Jan 2022 15:47:10 -0600 Subject: [PATCH 0057/1295] ASoC: simple-card: fix probe failure on platform component A previous change to simple-card resulted in asoc_simple_parse_dai attempting to retrieve the dai_name for platform components, which are unlikely to have a valid DAI name. This caused simple-card to fail to probe when using the xlnx_formatter_pcm as the platform component, since it does not register any DAI components. Since the dai_name is not used for platform components, just skip trying to retrieve it for those. Fixes: f107294c6422 ("ASoC: simple-card: support snd_soc_dai_link_component style for cpu") Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220107214711.1100162-6-robert.hancock@calian.com Signed-off-by: Mark Brown --- sound/soc/generic/simple-card.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index a89d1cfdda32..78419e18717d 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -28,6 +28,30 @@ static const struct snd_soc_ops simple_ops = { .hw_params = asoc_simple_hw_params, }; +static int asoc_simple_parse_platform(struct device_node *node, + struct snd_soc_dai_link_component *dlc) +{ + struct of_phandle_args args; + int ret; + + if (!node) + return 0; + + /* + * Get node via "sound-dai = <&phandle port>" + * it will be used as xxx_of_node on soc_bind_dai_link() + */ + ret = of_parse_phandle_with_args(node, DAI, CELL, 0, &args); + if (ret) + return ret; + + /* dai_name is not required and may not exist for plat component */ + + dlc->of_node = args.np; + + return 0; +} + static int asoc_simple_parse_dai(struct device_node *node, struct snd_soc_dai_link_component *dlc, int *is_single_link) @@ -289,7 +313,7 @@ static int simple_dai_link_of(struct asoc_simple_priv *priv, if (ret < 0) goto dai_link_of_err; - ret = asoc_simple_parse_dai(plat, platforms, NULL); + ret = asoc_simple_parse_platform(plat, platforms); if (ret < 0) goto dai_link_of_err; From 879cf8006475642b747aaaa4d06f7044ab2de794 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:26:58 +0300 Subject: [PATCH 0058/1295] regulator: max20086: fix error code in max20086_parse_regulators_dt() This code accidentally returns PTR_ERR(NULL) which is success. It should return a negative error code. Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Dan Carpenter Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20220111072657.GK11243@kili Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index fbc56b043071..63aa6ec3254a 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -140,7 +140,7 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) node = of_get_child_by_name(chip->dev->of_node, "regulators"); if (!node) { dev_err(chip->dev, "regulators node not found\n"); - return PTR_ERR(node); + return -ENODEV; } for (i = 0; i < chip->info->num_outputs; ++i) From fe75e84a8fe17449ea16b73cfcfc9e7d06a49130 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 12 Jan 2022 12:29:05 +0100 Subject: [PATCH 0059/1295] netfilter: nf_tables: set last expression in register tracking area nft_rule_for_each_expr() sets on last to nft_rule_last(), however, this is coming after track.last field is set on. Use nft_expr_last() to set track.last accordingly. Fixes: 12e4ecfa244b ("netfilter: nf_tables: add register tracking infrastructure") Reported-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1cde8cd0d1a7..cf454f8ca2b0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8299,7 +8299,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha return -ENOMEM; size = 0; - track.last = last; + track.last = nft_expr_last(rule); nft_rule_for_each_expr(expr, last, rule) { track.cur = expr; From f7a6021aaf02088870559f82fc13c58cda7fea1a Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 11 Jan 2022 10:50:48 +0800 Subject: [PATCH 0060/1295] ASoC: cpcap: Check for NULL pointer after calling of_get_child_by_name If the device does not exist, of_get_child_by_name() will return NULL pointer. And devm_snd_soc_register_component() does not check it. Also, I have noticed that cpcap_codec_driver has not been used yet. Therefore, it should be better to check it in order to avoid the future dereference of the NULL pointer. Fixes: f6cdf2d3445d ("ASoC: cpcap: new codec") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20220111025048.524134-1-jiasheng@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/cpcap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/cpcap.c b/sound/soc/codecs/cpcap.c index 598e09024e23..ffdf8b615efa 100644 --- a/sound/soc/codecs/cpcap.c +++ b/sound/soc/codecs/cpcap.c @@ -1667,6 +1667,8 @@ static int cpcap_codec_probe(struct platform_device *pdev) { struct device_node *codec_node = of_get_child_by_name(pdev->dev.parent->of_node, "audio-codec"); + if (!codec_node) + return -ENODEV; pdev->dev.of_node = codec_node; From 90e12a3191040bd3854d3e236c35921e4e92a044 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:29 -0500 Subject: [PATCH 0061/1295] NFSv4 remove zero number of fs_locations entries error check Remove the check for the zero length fs_locations reply in the xdr decoding, and instead check for that in the migration code. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 3 +++ fs/nfs/nfs4xdr.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f63dfa01001c..f3265575c28d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2106,6 +2106,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a9487c840052..8e70b92df4cc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3732,8 +3732,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_eio; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; From 8a59bb93b7e3cca389af44781a429ac12ac49be6 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:30 -0500 Subject: [PATCH 0062/1295] NFSv4 store server support for fs_location attribute Define and store if server returns it supports fs_locations attribute as a capability. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 ++ include/linux/nfs_fs_sb.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 44ab27f20f04..7d4c63282793 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3875,6 +3875,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; #endif + if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS) + server->caps |= NFS_CAP_FS_LOCATIONS; if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f24fc67af42d..8c08b356c8ca 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -289,5 +289,5 @@ struct nfs_server { #define NFS_CAP_COPY_NOTIFY (1U << 27) #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) - +#define NFS_CAP_FS_LOCATIONS (1U << 30) #endif From d068eebbd4822b6c14a7ea375dfe53ca5c69c776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 17 Dec 2021 16:48:54 +0100 Subject: [PATCH 0063/1295] cgroup/cpuset: Make child cpusets restrict parents on v1 hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") inteded to relax the check only on the default hierarchy (or v2 mode) but it dropped the check in v1 too. This patch returns and separates the legacy-only validations so that they can be considered only in the v1 mode, which should enforce the old constraints for the sake of compatibility. Fixes: 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") Suggested-by: Waiman Long Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 52 ++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..bb3531e7fda7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -590,6 +590,35 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) goto out; + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) + goto out; + + par = parent_cs(cur); + /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -1175,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, From 180d0eb290a5d11e6d3d99fea0841ceae2893901 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 13 Jan 2022 11:58:05 +0100 Subject: [PATCH 0064/1295] parisc: Add visible flag to toc_stack variable Add the visible flag to the toc_stack variable to make it visible for assembly code and to avoid a sparse warning. Reported-by: kernel test robot Signed-off-by: Helge Deller --- arch/parisc/kernel/toc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c index be9a0bebe61e..fa5a10eaf0aa 100644 --- a/arch/parisc/kernel/toc.c +++ b/arch/parisc/kernel/toc.c @@ -12,7 +12,7 @@ #include static unsigned int __aligned(16) toc_lock = 1; -DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack); +DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible; static void toc20_to_pt_regs(struct pt_regs *regs, struct pdc_toc_pim_20 *toc) { From 7d70984a1ad4c445dff08edb9aacce8906b6a222 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 13 Jan 2022 12:22:38 +0100 Subject: [PATCH 0065/1295] netfilter: nft_connlimit: memleak if nf_ct_netns_get() fails Check if nf_ct_netns_get() fails then release the limit object previously allocated via kmalloc(). Fixes: 37f319f37d90 ("netfilter: nft_connlimit: move stateful fields out of expression data") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_connlimit.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 7d00a1452b1d..3362417ebfdb 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -62,6 +62,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, { bool invert = false; u32 flags, limit; + int err; if (!tb[NFTA_CONNLIMIT_COUNT]) return -EINVAL; @@ -84,7 +85,15 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, priv->limit = limit; priv->invert = invert; - return nf_ct_netns_get(ctx->net, ctx->family); + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err < 0) + goto err_netns; + + return 0; +err_netns: + kfree(priv->list); + + return err; } static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, From 1976b2b31462151403c9fc110204fcc2a77bdfd1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 12 Jan 2022 10:27:38 -0500 Subject: [PATCH 0066/1295] NFSv4.1 query for fs_location attr on a new file system Query the server for other possible trunkable locations for a given file system on a 4.1+ mount. v2: -- added missing static to nfs4_discover_trunking, reported by the kernel test robot Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 7 ++++ fs/nfs/nfs4_fs.h | 9 ++--- fs/nfs/nfs4proc.c | 76 +++++++++++++++++++++++++++++++++++------ fs/nfs/nfs4state.c | 3 +- include/linux/nfs_xdr.h | 1 + 5 files changed, 81 insertions(+), 15 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e4dc1ab9312..f7e39cc4472b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -860,6 +860,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str server->namelen = pathinfo.max_namelen; } + if (clp->rpc_ops->discover_trunking != NULL && + (server->caps & NFS_CAP_FS_LOCATIONS)) { + error = clp->rpc_ops->discover_trunking(server, mntfh); + if (error < 0) + return error; + } + return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 89076dd32334..166ea6112e61 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -260,8 +260,8 @@ struct nfs4_state_maintenance_ops { }; struct nfs4_mig_recovery_ops { - int (*get_locations)(struct inode *, struct nfs4_fs_locations *, - struct page *, const struct cred *); + int (*get_locations)(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, struct page *, const struct cred *); int (*fsid_present)(struct inode *, const struct cred *); }; @@ -302,8 +302,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); -extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, - struct page *page, const struct cred *); +extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, + struct page *page, const struct cred *); extern int nfs4_proc_fsid_present(struct inode *, const struct cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct dentry *, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d4c63282793..fc4629d2d2ab 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3935,6 +3935,60 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static int _nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_fs_locations *locations = NULL; + struct page *page; + const struct cred *cred; + struct nfs_client *clp = server->nfs_client; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -ENOMEM; + + cred = ops->get_state_renewal_cred(clp); + if (cred == NULL) { + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOKEY; + } + + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) + goto out; + + status = nfs4_proc_get_locations(server, fhandle, locations, page, + cred); + if (status) + goto out; +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs_client *clp = server->nfs_client; + int err = 0; + + if (!nfs4_has_session(clp)) + goto out; + do { + err = nfs4_handle_exception(server, + _nfs4_discover_trunking(server, fhandle), + &exception); + } while (exception.retry); +out: + return err; +} + static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { @@ -7823,18 +7877,18 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, * appended to this compound to identify the client ID which is * performing recovery. */ -static int _nfs40_proc_get_locations(struct inode *inode, +static int _nfs40_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { .clientid = server->nfs_client->cl_clientid, - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7880,17 +7934,17 @@ static int _nfs40_proc_get_locations(struct inode *inode, * When the client supports GETATTR(fs_locations_info), it can * be plumbed in here. */ -static int _nfs41_proc_get_locations(struct inode *inode, +static int _nfs41_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7939,11 +7993,11 @@ static int _nfs41_proc_get_locations(struct inode *inode, * -NFS4ERR_LEASE_MOVED is returned if the server still has leases * from this client that require migration recovery. */ -int nfs4_proc_get_locations(struct inode *inode, +int nfs4_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; @@ -7956,10 +8010,11 @@ int nfs4_proc_get_locations(struct inode *inode, (unsigned long long)server->fsid.major, (unsigned long long)server->fsid.minor, clp->cl_hostname); - nfs_display_fhandle(NFS_FH(inode), __func__); + nfs_display_fhandle(fhandle, __func__); do { - status = ops->get_locations(inode, locations, page, cred); + status = ops->get_locations(server, fhandle, locations, page, + cred); if (status != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, status, &exception); @@ -10428,6 +10483,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .free_client = nfs4_free_client, .create_server = nfs4_create_server, .clone_server = nfs_clone_server, + .discover_trunking = nfs4_discover_trunking, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f3265575c28d..499bef9fe118 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2098,7 +2098,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } inode = d_inode(server->super->s_root); - result = nfs4_proc_get_locations(inode, locations, page, cred); + result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, + page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", __func__, result); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ddff92396a3f..728cb0c1f0b6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1797,6 +1797,7 @@ struct nfs_rpc_ops { struct nfs_server *(*create_server)(struct fs_context *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); + int (*discover_trunking)(struct nfs_server *, struct nfs_fh *); }; /* From f5b27cc6761e27ee6387a24df1a99ca77b360fea Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:32 -0500 Subject: [PATCH 0067/1295] NFSv4 expose nfs_parse_server_name function Make nfs_parse_server_name available outside of nfs4namespace.c. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4namespace.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 166ea6112e61..4f998dbeb73f 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -280,7 +280,8 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 873342308dc0..f1ed4f60a7f3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,8 +164,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net) { ssize_t ret; From a8d54baba7c65db2d3278873def61f8d3753d766 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:33 -0500 Subject: [PATCH 0068/1295] NFSv4 handle port presence in fs_location server string An fs_location attribute returns a string that can be ipv4, ipv6, or DNS name. An ip location can have a port appended to it and if no port is present a default port needs to be set. If rpc_pton() fails to parse, try calling rpc_uaddr2socaddr() that can convert an universal address. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4namespace.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f998dbeb73f..84f39b6f1b1e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -281,7 +281,7 @@ int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, - size_t salen, struct net *net); + size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f1ed4f60a7f3..3680c8da510c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -165,15 +165,20 @@ static int nfs4_validate_fspath(struct dentry *dentry, } size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, - size_t salen, struct net *net) + size_t salen, struct net *net, int port) { ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); - if (ret < 0) - ret = 0; + ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, sa, salen); + if (ret < 0) + ret = 0; + } + } else if (port) { + rpc_set_port(sa, port); } return ret; } @@ -328,7 +333,7 @@ static int try_location(struct fs_context *fc, nfs_parse_server_name(buf->data, buf->len, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address), - fc->net_ns); + fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) continue; @@ -496,7 +501,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net); + sap, addr_bufsize, net, 0); if (salen == 0) continue; rpc_set_port(sap, NFS_PORT); From b8a09619a56334414cbd7f935a0796240d0cc07e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:34 -0500 Subject: [PATCH 0069/1295] SUNRPC allow for unspecified transport time in rpc_clnt_add_xprt If the supplied argument doesn't specify the transport type, use the type of the existing rpc clnt and its existing transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a312ea2bc440..c83fe618767c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2900,7 +2900,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport, reuseport; - int ret = 0; + int ret = 0, ident; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); @@ -2914,8 +2914,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, reuseport = xprt->reuseport; connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; + ident = xprt->xprt_class->ident; rcu_read_unlock(); + if (!xprtargs->ident) + xprtargs->ident = ident; xprt = xprt_create_transport(xprtargs); if (IS_ERR(xprt)) { ret = PTR_ERR(xprt); From 4ca9f31a2be66d5fbf34b5b91ef17de7480992e1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:35 -0500 Subject: [PATCH 0070/1295] NFSv4.1 test and add 4.1 trunking transport For each location returned in FS_LOCATION query, establish a transport to the server, send EXCHANGE_ID and test for trunking, if successful, add the transport to the exiting client. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fc4629d2d2ab..b18f31b2c9e7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3935,6 +3935,56 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static void test_fs_location_for_trunking(struct nfs4_fs_location *location, + struct nfs_client *clp, + struct nfs_server *server) +{ + int i; + + for (i = 0; i < location->nservers; i++) { + struct nfs4_string *srv_loc = &location->servers[i]; + struct sockaddr addr; + size_t addrlen; + struct xprt_create xprt_args = { + .ident = 0, + .net = clp->cl_net, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + char *servername = NULL; + + if (!srv_loc->len) + continue; + + addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len, + &addr, sizeof(addr), + clp->cl_net, server->port); + if (!addrlen) + return; + xprt_args.dstaddr = &addr; + xprt_args.addrlen = addrlen; + servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); + if (!servername) + return; + memcpy(servername, srv_loc->data, srv_loc->len); + servername[srv_loc->len] = '\0'; + xprt_args.servername = servername; + + xprtdata.cred = nfs4_get_clid_cred(clp); + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_cred(xprtdata.cred); + kfree(servername); + } +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3944,7 +3994,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_client *clp = server->nfs_client; const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; - int status = -ENOMEM; + int status = -ENOMEM, i; cred = ops->get_state_renewal_cred(clp); if (cred == NULL) { @@ -3962,6 +4012,10 @@ static int _nfs4_discover_trunking(struct nfs_server *server, cred); if (status) goto out; + + for (i = 0; i < locations->nlocations; i++) + test_fs_location_for_trunking(&locations->locations[i], clp, + server); out: if (page) __free_page(page); From 776d794f28c95051bc70405a7b1fa40115658a18 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 9 Sep 2021 12:32:38 +0800 Subject: [PATCH 0071/1295] net/sunrpc: fix reference count leaks in rpc_sysfs_xprt_state_change The refcount leak issues take place in an error handling path. When the 3rd argument buf doesn't match with "offline", "online" or "remove", the function simply returns -EINVAL and forgets to decrease the reference count of a rpc_xprt object and a rpc_xprt_switch object increased by rpc_sysfs_xprt_kobj_get_xprt() and rpc_sysfs_xprt_kobj_get_xprt_switch(), causing reference count leaks of both unused objects. Fix this issue by jumping to the error handling path labelled with out_put when buf matches none of "offline", "online" or "remove". Signed-off-by: Xiyu Yang Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b1aea3419218..49fd0339dc59 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -295,8 +295,10 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, online = 1; else if (!strncmp(buf, "remove", 6)) remove = 1; - else - return -EINVAL; + else { + count = -EINVAL; + goto out_put; + } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; From 1a48db3fef499f615b56093947ec4b0d3d8e3021 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 15 Nov 2021 11:54:25 -0500 Subject: [PATCH 0072/1295] sunrpc: Fix potential race conditions in rpc_sysfs_xprt_state_change() We need to use test_and_set_bit() when changing xprt state flags to avoid potentially getting xps->xps_nactive out of sync. Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 49fd0339dc59..b64a0286b182 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -309,25 +309,28 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, goto release_tasks; } if (offline) { - set_bit(XPRT_OFFLINE, &xprt->state); - spin_lock(&xps->xps_lock); - xps->xps_nactive--; - spin_unlock(&xps->xps_lock); + if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } } else if (online) { - clear_bit(XPRT_OFFLINE, &xprt->state); - spin_lock(&xps->xps_lock); - xps->xps_nactive++; - spin_unlock(&xps->xps_lock); + if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } } else if (remove) { if (test_bit(XPRT_OFFLINE, &xprt->state)) { - set_bit(XPRT_REMOVE, &xprt->state); - xprt_force_disconnect(xprt); - if (test_bit(XPRT_CONNECTED, &xprt->state)) { - if (!xprt->sending.qlen && - !xprt->pending.qlen && - !xprt->backlog.qlen && - !atomic_long_read(&xprt->queuelen)) - rpc_xprt_switch_remove_xprt(xps, xprt); + if (!test_and_set_bit(XPRT_REMOVE, &xprt->state)) { + xprt_force_disconnect(xprt); + if (test_bit(XPRT_CONNECTED, &xprt->state)) { + if (!xprt->sending.qlen && + !xprt->pending.qlen && + !xprt->backlog.qlen && + !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt); + } } } else { count = -EINVAL; From 6840f9094f2bd788a316d8cb0a4e42538d3e47dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jan 2022 16:44:19 -0500 Subject: [PATCH 0073/1295] pagevec: Initialise folio_batch->percpu_pvec_drained When UBSAN is enabled, it reports an invalid value in __pagevec_release() when accessing pvec->percpu_pvec_drained, which is simply whatever garbage was on the stack. Initialise it when initialising the rest of the folio_batch. Fixes: 10331795fb79 ("pagevec: Add folio_batch") Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagevec.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index dda8d5868c81..67b1246f136b 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -111,6 +111,7 @@ static_assert(offsetof(struct pagevec, pages) == static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->percpu_pvec_drained = false; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) From 094d00f8ca58c5d29b25e23b4daaed1ff1f13b41 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 14 Jan 2022 08:57:58 +0000 Subject: [PATCH 0074/1295] KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance CMOs issued from EL2 cannot directly use the kernel helpers, as EL2 doesn't have a mapping of the guest pages. Oops. Instead, use the mm_ops indirection to use helpers that will perform a mapping at EL2 and allow the CMO to be effective. Fixes: 25aa28691bb9 ("KVM: arm64: Move guest CMOs to the fault handlers") Reviewed-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220114125038.1336965-1-maz@kernel.org --- arch/arm64/kvm/hyp/pgtable.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 844a6f003fd5..2cb3867eb7c2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, */ stage2_put_pte(ptep, mmu, addr, level, mm_ops); - if (need_flush) { - kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops); - - dcache_clean_inval_poc((unsigned long)pte_follow, - (unsigned long)pte_follow + - kvm_granule_size(level)); - } + if (need_flush && mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + kvm_granule_size(level)); if (childp) mm_ops->put_page(childp); @@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable *pgt = arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; - kvm_pte_t *pte_follow; if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) return 0; - pte_follow = kvm_pte_follow(pte, mm_ops); - dcache_clean_inval_poc((unsigned long)pte_follow, - (unsigned long)pte_follow + - kvm_granule_size(level)); + if (mm_ops->dcache_clean_inval_poc) + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + kvm_granule_size(level)); return 0; } From 4ee7e4a6c9b298da44029ed9ec8ed23ae49cc209 Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Wed, 12 Jan 2022 19:33:21 +0100 Subject: [PATCH 0075/1295] ovl: fix NULL pointer dereference in copy up warning This patch is fixing a NULL pointer dereference to get a recently introduced warning message working. Fixes: 5b0a414d06c3 ("ovl: fix filattr copy-up failure") Signed-off-by: Christoph Fritz Cc: # v5.15 Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b193d08a3dc3..347b06479663 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -145,7 +145,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, if (err == -ENOTTY || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", - old, err); + old->dentry, err); return err; } @@ -168,7 +168,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, err = ovl_real_fileattr_get(new, &newfa); if (err) { pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", - new, err); + new->dentry, err); return err; } From c03061e7a210b4fe37440a1940fc198744a55ca4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:23 -0500 Subject: [PATCH 0076/1295] xprtrdma: Remove final dprintk call sites from xprtrdma Deprecated. This information is available via tracepoints. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3d3673ba9e1e..1d6e85fe3133 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -274,8 +274,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ENETUNREACH; goto wake_connect_worker; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %pISpc rejected: %s\n", - sap, rdma_reject_msg(id, event->status)); ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->re_connect_status = -ENOTCONN; @@ -291,8 +289,6 @@ disconnected: break; } - dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap, - ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } @@ -419,14 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_attr.qp_type = IB_QPT_RC; ep->re_attr.port_num = ~0; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " - "iovs: send %d recv %d\n", - __func__, - ep->re_attr.cap.max_send_wr, - ep->re_attr.cap.max_recv_wr, - ep->re_attr.cap.max_send_sge, - ep->re_attr.cap.max_recv_sge); - ep->re_send_batch = ep->re_max_requests >> 3; ep->re_send_count = ep->re_send_batch; init_waitqueue_head(&ep->re_connect_wait); From c0f26167ddcf94fb94e80fcb20aaac7f7db13c1a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:30 -0500 Subject: [PATCH 0077/1295] xprtrdma: Remove definitions of RPCDBG_FACILITY Deprecated. dprintk is no longer used in xprtrdma. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 ---- net/sunrpc/xprtrdma/frwr_ops.c | 4 ---- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---- net/sunrpc/xprtrdma/transport.c | 4 ---- net/sunrpc/xprtrdma/verbs.c | 11 ----------- 5 files changed, 27 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 17f174d6ea3b..faba7136dd9a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -13,10 +13,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - #undef RPCRDMA_BACKCHANNEL_DEBUG /** diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ff699307e820..515dd7a66a04 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -45,10 +45,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - static void frwr_cid_init(struct rpcrdma_ep *ep, struct rpcrdma_mr *mr) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8035a983c8ce..281ddb87ac8d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -54,10 +54,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 16e5696314a4..42e375dbdadb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -60,10 +60,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* * tunables */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1d6e85fe3133..f172d1298013 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -63,17 +63,6 @@ #include "xprt_rdma.h" #include -/* - * Globals/Macros - */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* - * internal functions - */ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, From aed28b7a2d620cb5cd0c554cb889075c02e25e8e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:36 -0500 Subject: [PATCH 0078/1295] SUNRPC: Don't dereference xprt->snd_task if it's a cookie Fixes: e26d9972720e ("SUNRPC: Clean up scheduling of autoclose") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index a6af347c935d..7cdcbc6dc38e 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -965,7 +965,8 @@ TRACE_EVENT(rpc_socket_nospace, { BIT(XPRT_REMOVE), "REMOVE" }, \ { BIT(XPRT_CONGESTED), "CONGESTED" }, \ { BIT(XPRT_CWND_WAIT), "CWND_WAIT" }, \ - { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }) + { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }, \ + { BIT(XPRT_SND_IS_COOKIE), "SND_IS_COOKIE" }) DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class, TP_PROTO( @@ -1162,8 +1163,11 @@ DECLARE_EVENT_CLASS(xprt_writelock_event, __entry->task_id = -1; __entry->client_id = -1; } - __entry->snd_task_id = xprt->snd_task ? - xprt->snd_task->tk_pid : -1; + if (xprt->snd_task && + !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + __entry->snd_task_id = xprt->snd_task->tk_pid; + else + __entry->snd_task_id = -1; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -1208,8 +1212,12 @@ DECLARE_EVENT_CLASS(xprt_cong_event, __entry->task_id = -1; __entry->client_id = -1; } - __entry->snd_task_id = xprt->snd_task ? - xprt->snd_task->tk_pid : -1; + if (xprt->snd_task && + !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + __entry->snd_task_id = xprt->snd_task->tk_pid; + else + __entry->snd_task_id = -1; + __entry->cong = xprt->cong; __entry->cwnd = xprt->cwnd; __entry->wait = test_bit(XPRT_CWND_WAIT, &xprt->state); From 94fd19752b28aa66c98e7991734af91dfc529f8f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 14 Jan 2022 16:57:56 +0100 Subject: [PATCH 0079/1295] ovl: don't fail copy up if no fileattr support on upper Christoph Fritz is reporting that failure to copy up fileattr when upper doesn't support fileattr or xattr results in a regression. Return success in these failure cases; this reverts overlayfs to the old behavior. Add a pr_warn_once() in these cases to still let the user know about the copy up failures. Reported-by: Christoph Fritz Fixes: 72db82115d2b ("ovl: copy up sync/noatime fileattr flags") Cc: # v5.15 Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 347b06479663..e040970408d4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -157,7 +157,9 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, */ if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { err = ovl_set_protattr(inode, new->dentry, &oldfa); - if (err) + if (err == -EPERM) + pr_warn_once("copying fileattr: no xattr on upper\n"); + else if (err) return err; } @@ -167,6 +169,14 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, err = ovl_real_fileattr_get(new, &newfa); if (err) { + /* + * Returning an error if upper doesn't support fileattr will + * result in a regression, so revert to the old behavior. + */ + if (err == -ENOTTY || err == -EINVAL) { + pr_warn_once("copying fileattr: no support on upper\n"); + return 0; + } pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", new->dentry, err); return err; From b992f01e66150fc5e90be4a96f5eb8e634c8249e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:05 +0530 Subject: [PATCH 0080/1295] bpf: Guard against accessing NULL pt_regs in bpf_get_task_stack() task_pt_regs() can return NULL on powerpc for kernel threads. This is then used in __bpf_get_stack() to check for user mode, resulting in a kernel oops. Guard against this by checking return value of task_pt_regs() before trying to obtain the call chain. Fixes: fa28dcb82a38f8 ("bpf: Introduce helper bpf_get_task_stack()") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Naveen N. Rao Acked-by: Daniel Borkmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d5ef83c361cc255494afd15ff1b4fb02a36e1dcf.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- kernel/bpf/stackmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 49e567209c6b..22c8ae94e4c1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -472,13 +472,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; From fab07611fb2e6a15fac05c4583045ca5582fd826 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:06 +0530 Subject: [PATCH 0081/1295] powerpc32/bpf: Fix codegen for bpf-to-bpf calls Pad instructions emitted for BPF_CALL so that the number of instructions generated does not change for different function addresses. This is especially important for calls to other bpf functions, whose address will only be known during extra pass. Fixes: 51c66ad849a703 ("powerpc/bpf: Implement extended BPF on PPC32") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/52d8fe51f7620a6f27f377791564d79d75463576.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/net/bpf_jit_comp32.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index faaebd446cad..c20b49bf8f5b 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -191,6 +191,9 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun if (image && rel < 0x2000000 && rel >= -0x2000000) { PPC_BL_ABS(func); + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_NOP()); } else { /* Load function address into r0 */ EMIT(PPC_RAW_LIS(_R0, IMM_H(func))); From f9320c49993ca3c0ec0f9a7026b313735306bb8b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:07 +0530 Subject: [PATCH 0082/1295] powerpc/bpf: Update ldimm64 instructions during extra pass These instructions are updated after the initial JIT, so redo codegen during the extra pass. Rename bpf_jit_fixup_subprog_calls() to clarify that this is more than just subprog calls. Fixes: 69c087ba6225b5 ("bpf: Add bpf_for_each_map_elem() helper") Cc: stable@vger.kernel.org # v5.15 Signed-off-by: Naveen N. Rao Tested-by: Jiri Olsa Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7cc162af77ba918eb3ecd26ec9e7824bc44b1fae.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 29 +++++++++++++++++++++++------ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 7 ++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index d6ffdd0f2309..56dd1f4e3e44 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -23,15 +23,15 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned int size) memset32(area, BREAKPOINT_INSTRUCTION, size / 4); } -/* Fix the branch target addresses for subprog calls */ -static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, u32 *addrs) +/* Fix updated addresses (for subprog calls, ldimm64, et al) during extra pass */ +static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image, + struct codegen_context *ctx, u32 *addrs) { const struct bpf_insn *insn = fp->insnsi; bool func_addr_fixed; u64 func_addr; u32 tmp_idx; - int i, ret; + int i, j, ret; for (i = 0; i < fp->len; i++) { /* @@ -66,6 +66,23 @@ static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, * of the JITed sequence remains unchanged. */ ctx->idx = tmp_idx; + } else if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) { + tmp_idx = ctx->idx; + ctx->idx = addrs[i] / 4; +#ifdef CONFIG_PPC32 + PPC_LI32(ctx->b2p[insn[i].dst_reg] - 1, (u32)insn[i + 1].imm); + PPC_LI32(ctx->b2p[insn[i].dst_reg], (u32)insn[i].imm); + for (j = ctx->idx - addrs[i] / 4; j < 4; j++) + EMIT(PPC_RAW_NOP()); +#else + func_addr = ((u64)(u32)insn[i].imm) | (((u64)(u32)insn[i + 1].imm) << 32); + PPC_LI64(b2p[insn[i].dst_reg], func_addr); + /* overwrite rest with nops */ + for (j = ctx->idx - addrs[i] / 4; j < 5; j++) + EMIT(PPC_RAW_NOP()); +#endif + ctx->idx = tmp_idx; + i++; } } @@ -200,13 +217,13 @@ skip_init_ctx: /* * Do not touch the prologue and epilogue as they will remain * unchanged. Only fix the branch target address for subprog - * calls in the body. + * calls in the body, and ldimm64 instructions. * * This does not change the offsets and lengths of the subprog * call instruction sequences and hence, the size of the JITed * image as well. */ - bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); + bpf_jit_fixup_addresses(fp, code_base, &cgctx, addrs); /* There is no need to perform the usual passes. */ goto skip_codegen_passes; diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index c20b49bf8f5b..cf8dd8aea386 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -293,6 +293,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * bool func_addr_fixed; u64 func_addr; u32 true_cond; + u32 tmp_idx; + int j; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -908,8 +910,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * 16 byte instruction that uses two 'struct bpf_insn' */ case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ + tmp_idx = ctx->idx; PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm); PPC_LI32(dst_reg, (u32)insn[i].imm); + /* padding to allow full 4 instructions for later patching */ + for (j = ctx->idx - tmp_idx; j < 4; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; break; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 9eae8d8ed340..bfdd417c6b81 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -319,6 +319,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * u64 imm64; u32 true_cond; u32 tmp_idx; + int j; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -848,9 +849,13 @@ emit_clear: case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ imm64 = ((u64)(u32) insn[i].imm) | (((u64)(u32) insn[i+1].imm) << 32); + tmp_idx = ctx->idx; + PPC_LI64(dst_reg, imm64); + /* padding to allow full 5 instructions for later patching */ + for (j = ctx->idx - tmp_idx; j < 5; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; - PPC_LI64(dst_reg, imm64); break; /* From 88a71086c48ae98e93c0208044827621e9717f7e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:08 +0530 Subject: [PATCH 0083/1295] tools/bpf: Rename 'struct event' to avoid naming conflict On ppc64le, trying to build bpf seltests throws the below warning: In file included from runqslower.bpf.c:5: ./runqslower.h:7:8: error: redefinition of 'event' struct event { ^ /home/naveen/linux/tools/testing/selftests/bpf/tools/build/runqslower/vmlinux.h:156602:8: note: previous definition is here struct event { ^ This happens since 'struct event' is defined in drivers/net/ethernet/alteon/acenic.h . Rename the one in runqslower to a more appropriate 'runq_event' to avoid the naming conflict. Signed-off-by: Naveen N. Rao Acked-by: Daniel Borkmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c13cb3767d26257ca4387b8296b632b433a58db6.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- tools/bpf/runqslower/runqslower.bpf.c | 2 +- tools/bpf/runqslower/runqslower.c | 2 +- tools/bpf/runqslower/runqslower.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index ab9353f2fd46..9a5c1f008fe6 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -68,7 +68,7 @@ int handle__sched_switch(u64 *ctx) */ struct task_struct *prev = (struct task_struct *)ctx[1]; struct task_struct *next = (struct task_struct *)ctx[2]; - struct event event = {}; + struct runq_event event = {}; u64 *tsp, delta_us; long state; u32 pid; diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c index 2414cc764461..d78f4148597f 100644 --- a/tools/bpf/runqslower/runqslower.c +++ b/tools/bpf/runqslower/runqslower.c @@ -100,7 +100,7 @@ static int bump_memlock_rlimit(void) void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) { - const struct event *e = data; + const struct runq_event *e = data; struct tm *tm; char ts[32]; time_t t; diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h index 9db225425e5f..4f70f07200c2 100644 --- a/tools/bpf/runqslower/runqslower.h +++ b/tools/bpf/runqslower/runqslower.h @@ -4,7 +4,7 @@ #define TASK_COMM_LEN 16 -struct event { +struct runq_event { char task[TASK_COMM_LEN]; __u64 delta_us; pid_t pid; From 3f5f766d5f7f95a69a630da3544a1a0cee1cdddf Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:12 +0530 Subject: [PATCH 0084/1295] powerpc64/bpf: Limit 'ldbrx' to processors compliant with ISA v2.06 Johan reported the below crash with test_bpf on ppc64 e5500: test_bpf: #296 ALU_END_FROM_LE 64: 0x0123456789abcdef -> 0x67452301 jited:1 Oops: Exception in kernel mode, sig: 4 [#1] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 Modules linked in: test_bpf(+) CPU: 0 PID: 76 Comm: insmod Not tainted 5.14.0-03771-g98c2059e008a-dirty #1 NIP: 8000000000061c3c LR: 80000000006dea64 CTR: 8000000000061c18 REGS: c0000000032d3420 TRAP: 0700 Not tainted (5.14.0-03771-g98c2059e008a-dirty) MSR: 0000000080089000 CR: 88002822 XER: 20000000 IRQMASK: 0 <...> NIP [8000000000061c3c] 0x8000000000061c3c LR [80000000006dea64] .__run_one+0x104/0x17c [test_bpf] Call Trace: .__run_one+0x60/0x17c [test_bpf] (unreliable) .test_bpf_init+0x6a8/0xdc8 [test_bpf] .do_one_initcall+0x6c/0x28c .do_init_module+0x68/0x28c .load_module+0x2460/0x2abc .__do_sys_init_module+0x120/0x18c .system_call_exception+0x110/0x1b8 system_call_common+0xf0/0x210 --- interrupt: c00 at 0x101d0acc <...> ---[ end trace 47b2bf19090bb3d0 ]--- Illegal instruction The illegal instruction turned out to be 'ldbrx' emitted for BPF_FROM_[L|B]E, which was only introduced in ISA v2.06. Guard use of the same and implement an alternative approach for older processors. Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") Reported-by: Johan Almbladh Signed-off-by: Naveen N. Rao Tested-by: Johan Almbladh Acked-by: Johan Almbladh Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d1e51c6fdf572062cf3009a751c3406bda01b832.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/net/bpf_jit_comp64.c | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index efad07081cc0..9675303b724e 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -500,6 +500,7 @@ #define PPC_RAW_LDX(r, base, b) (0x7c00002a | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_LHZ(r, base, i) (0xa0000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LHBRX(r, base, b) (0x7c00062c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_LWBRX(r, base, b) (0x7c00042c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_LDBRX(r, base, b) (0x7c000428 | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_STWCX(s, a, b) (0x7c00012d | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_CMPWI(a, i) (0x2c000000 | ___PPC_RA(a) | IMM_L(i)) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index bfdd417c6b81..e1e8c934308a 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -634,17 +634,21 @@ bpf_alu32_trunc: EMIT(PPC_RAW_MR(dst_reg, b2p[TMP_REG_1])); break; case 64: - /* - * Way easier and faster(?) to store the value - * into stack and then use ldbrx - * - * ctx->seen will be reliable in pass2, but - * the instructions generated will remain the - * same across all passes - */ + /* Store the value to stack and then use byte-reverse loads */ PPC_BPF_STL(dst_reg, 1, bpf_jit_stack_local(ctx)); EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], 1, bpf_jit_stack_local(ctx))); - EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); + } else { + EMIT(PPC_RAW_LWBRX(dst_reg, 0, b2p[TMP_REG_1])); + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + EMIT(PPC_RAW_SLDI(dst_reg, dst_reg, 32)); + EMIT(PPC_RAW_LI(b2p[TMP_REG_2], 4)); + EMIT(PPC_RAW_LWBRX(b2p[TMP_REG_2], b2p[TMP_REG_2], b2p[TMP_REG_1])); + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) + EMIT(PPC_RAW_SLDI(b2p[TMP_REG_2], b2p[TMP_REG_2], 32)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, b2p[TMP_REG_2])); + } break; } break; From 252745240ba0ae774d2f80c5e185ed59fbc4fb41 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Jan 2022 11:26:25 +0000 Subject: [PATCH 0085/1295] powerpc/audit: Fix syscall_get_arch() Commit 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") and commit 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments()") replaced test_tsk_thread_flag(task, TIF_32BIT)) by is_32bit_task(). But is_32bit_task() applies on current task while be want the test done on task 'task' So add a new macro is_tsk_32bit_task() to check any task. Fixes: 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") Fixes: 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments()") Cc: stable@vger.kernel.org Reported-by: Dmitry V. Levin Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c55cddb8f65713bf5859ed675d75a50cb37d5995.1642159570.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 4 ++-- arch/powerpc/include/asm/thread_info.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 52d05b465e3e..25fc8ad9a27a 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,7 +90,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long val, mask = -1UL; unsigned int n = 6; - if (is_32bit_task()) + if (is_tsk_32bit_task(task)) mask = 0xffffffff; while (n--) { @@ -105,7 +105,7 @@ static inline void syscall_get_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { - if (is_32bit_task()) + if (is_tsk_32bit_task(task)) return AUDIT_ARCH_PPC; else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) return AUDIT_ARCH_PPC64LE; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 5725029aaa29..d6e649b3c70b 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -168,8 +168,10 @@ static inline bool test_thread_local_flags(unsigned int flags) #ifdef CONFIG_COMPAT #define is_32bit_task() (test_thread_flag(TIF_32BIT)) +#define is_tsk_32bit_task(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT)) #else #define is_32bit_task() (IS_ENABLED(CONFIG_PPC32)) +#define is_tsk_32bit_task(tsk) (IS_ENABLED(CONFIG_PPC32)) #endif #if defined(CONFIG_PPC64) From b7ec62d7ee0f0b8af6ba190501dff7f9ee6545ca Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:57 -0700 Subject: [PATCH 0086/1295] bitops: protect find_first_{,zero}_bit properly find_first_bit() and find_first_zero_bit() are not protected with ifdefs as other functions in find.h. It causes build errors on some platforms if CONFIG_GENERIC_FIND_FIRST_BIT is enabled. Signed-off-by: Yury Norov Fixes: 2cc7b6a44ac2 ("lib: add fast path for find_first_*_bit() and find_last_bit()") Reported-by: kernel test robot Tested-by: Wolfram Sang --- include/asm-generic/bitops/find.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 0d132ee2a291..835f959a25f2 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -97,6 +97,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, #ifdef CONFIG_GENERIC_FIND_FIRST_BIT +#ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at @@ -116,7 +117,9 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return _find_first_bit(addr, size); } +#endif +#ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at @@ -136,6 +139,8 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return _find_first_zero_bit(addr, size); } +#endif + #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit From 6b8ecb84f8f64017ae6e56cd745ad88e48f68779 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:58 -0700 Subject: [PATCH 0087/1295] bitops: move find_bit_*_le functions from le.h to find.h It's convenient to have all find_bit declarations in one place. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/asm-generic/bitops/find.h | 69 +++++++++++++++++++++++++++++++ include/asm-generic/bitops/le.h | 64 ---------------------------- 2 files changed, 69 insertions(+), 64 deletions(-) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 835f959a25f2..91b1b23f2b0c 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -190,4 +190,73 @@ extern unsigned long find_next_clump8(unsigned long *clump, #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) +#if defined(__LITTLE_ENDIAN) + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#ifndef find_next_zero_bit_le +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} +#endif + +#ifndef find_next_bit_le +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} +#endif + +#ifndef find_first_zero_bit_le +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) +#endif + +#else +#error "Please fix " +#endif + #endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 5a28629cbf4d..d51beff60375 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,83 +2,19 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ -#include #include #include -#include #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 -static inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_zero_bit(addr, size, offset); -} - -static inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) -{ - return find_first_zero_bit(addr, size); -} - #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) -#ifndef find_next_zero_bit_le -static inline -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); -} #endif -#ifndef find_next_bit_le -static inline -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); -} -#endif - -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - -#else -#error "Please fix " -#endif static inline int test_bit_le(int nr, const void *addr) { From 47d8c15615c0a2046d2d90b04cb80b81ddf31fb1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:59 -0700 Subject: [PATCH 0088/1295] include: move find.h from asm_generic to linux find_bit API and bitmap API are closely related, but inclusion paths are different - include/asm-generic and include/linux, correspondingly. In the past it made a lot of troubles due to circular dependencies and/or undefined symbols. Fix this by moving find.h under include/linux. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Geert Uytterhoeven --- MAINTAINERS | 2 +- arch/alpha/include/asm/bitops.h | 2 -- arch/arc/include/asm/bitops.h | 1 - arch/arm/include/asm/bitops.h | 1 - arch/arm64/include/asm/bitops.h | 1 - arch/csky/include/asm/bitops.h | 1 - arch/h8300/include/asm/bitops.h | 1 - arch/hexagon/include/asm/bitops.h | 1 - arch/ia64/include/asm/bitops.h | 2 -- arch/m68k/include/asm/bitops.h | 2 -- arch/mips/include/asm/bitops.h | 1 - arch/openrisc/include/asm/bitops.h | 1 - arch/parisc/include/asm/bitops.h | 1 - arch/powerpc/include/asm/bitops.h | 2 -- arch/riscv/include/asm/bitops.h | 1 - arch/s390/include/asm/bitops.h | 1 - arch/sh/include/asm/bitops.h | 1 - arch/sparc/include/asm/bitops_32.h | 1 - arch/sparc/include/asm/bitops_64.h | 2 -- arch/x86/include/asm/bitops.h | 2 -- arch/xtensa/include/asm/bitops.h | 1 - include/asm-generic/bitops.h | 1 - include/linux/bitmap.h | 1 + include/{asm-generic/bitops => linux}/find.h | 12 +++++++++--- 24 files changed, 11 insertions(+), 31 deletions(-) rename include/{asm-generic/bitops => linux}/find.h (97%) diff --git a/MAINTAINERS b/MAINTAINERS index dd36acc87ce6..4646786f5302 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3359,8 +3359,8 @@ M: Yury Norov R: Andy Shevchenko R: Rasmus Villemoes S: Maintained -F: include/asm-generic/bitops/find.h F: include/linux/bitmap.h +F: include/linux/find.h F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index 5adca78830b5..e1d8483a45f2 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -430,8 +430,6 @@ static inline unsigned int __arch_hweight8(unsigned int w) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ /* diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index a7daaf64ae34..bdb7e190a294 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -189,7 +189,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x) #include #include -#include #include #include diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index c92e42a5c8f7..8e94fe7ab5eb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -264,7 +264,6 @@ static inline int find_next_bit_le(const void *p, int size, int offset) #endif -#include #include /* diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h index 81a3e519b07d..9b3c787132d2 100644 --- a/arch/arm64/include/asm/bitops.h +++ b/arch/arm64/include/asm/bitops.h @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 02b72a000767..72e1b2aa29a0 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -59,7 +59,6 @@ static __always_inline unsigned long __fls(unsigned long x) #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h index c867a80cab5b..4489e3d6edd3 100644 --- a/arch/h8300/include/asm/bitops.h +++ b/arch/h8300/include/asm/bitops.h @@ -168,7 +168,6 @@ static inline unsigned long __ffs(unsigned long word) return result; } -#include #include #include #include diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 71429f756af0..75d6ba3643b8 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -271,7 +271,6 @@ static inline unsigned long __fls(unsigned long word) } #include -#include #include #include diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 2f24ee6459d2..577be93c0818 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -441,8 +441,6 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 7b93e1fd8ffa..51283db53667 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -529,6 +529,4 @@ static inline int __fls(int x) #include #endif /* __KERNEL__ */ -#include - #endif /* _M68K_BITOPS_H */ diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index dc2a6234dd3c..c09d57f907f7 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -446,7 +446,6 @@ static inline int ffs(int word) } #include -#include #ifdef __KERNEL__ diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h index 7f1ca35213d8..d773ed938acb 100644 --- a/arch/openrisc/include/asm/bitops.h +++ b/arch/openrisc/include/asm/bitops.h @@ -30,7 +30,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index daa2afd974fb..0ec9cfc5131f 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -203,7 +203,6 @@ static __inline__ int fls(unsigned int x) #include #include #include -#include #include #include diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 11847b6a244e..abc8f2c7c570 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -255,8 +255,6 @@ unsigned long __arch_hweight64(__u64 w); #include #endif -#include - /* wrappers that deal with KASAN instrumentation */ #include #include diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 396a3303c537..3540b690944b 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 5a530c552c23..1d40630128a5 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -387,7 +387,6 @@ static inline int fls(unsigned int word) #endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ #include -#include #include #include #include diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 3b6c7b5b7ec9..10ceb0d6b5a9 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -68,6 +68,5 @@ static inline unsigned long __ffs(unsigned long word) #include #include -#include #endif /* __ASM_SH_BITOPS_H */ diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h index 0ceff3b915a8..889afa9f990f 100644 --- a/arch/sparc/include/asm/bitops_32.h +++ b/arch/sparc/include/asm/bitops_32.h @@ -100,7 +100,6 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr) #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index ca7ea5913494..005a8ae858f1 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -52,8 +52,6 @@ unsigned int __arch_hweight8(unsigned int w); #include #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 0367efdc5b7a..a288ecd230ab 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x) #include #endif -#include - #include #include diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index 3f71d364ba90..cd225896c40f 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -205,7 +205,6 @@ BIT_OPS(change, "xor", ) #undef BIT_OP #undef TEST_AND_BIT_OP -#include #include #include diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index df9b5bc3d282..a47b8a71d6fe 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index a241dcf50f39..ead4a150bd7f 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/include/asm-generic/bitops/find.h b/include/linux/find.h similarity index 97% rename from include/asm-generic/bitops/find.h rename to include/linux/find.h index 91b1b23f2b0c..c5410c243e04 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/linux/find.h @@ -1,6 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_BITOPS_FIND_H_ -#define _ASM_GENERIC_BITOPS_FIND_H_ +#ifndef __LINUX_FIND_H_ +#define __LINUX_FIND_H_ + +#ifndef __LINUX_BITMAP_H +#error only can be included directly +#endif + +#include extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, @@ -259,4 +265,4 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif -#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ +#endif /*__LINUX_FIND_H_ */ From c126a53c276048125b4a950072bab37ad0fea120 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:00 -0700 Subject: [PATCH 0089/1295] arch: remove GENERIC_FIND_FIRST_BIT entirely In 5.12 cycle we enabled GENERIC_FIND_FIRST_BIT config option for ARM64 and MIPS. It increased performance and shrunk .text size; and so far I didn't receive any negative feedback on the change. https://lore.kernel.org/linux-arch/20210225135700.1381396-1-yury.norov@gmail.com/ Now I think it's a good time to switch all architectures to use find_{first,last}_bit() unconditionally, and so remove corresponding config option. The patch does't introduce functioal changes for arc, arm, arm64, mips, m68k, s390 and x86, for other architectures I expect improvement both in performance and .text size. Signed-off-by: Yury Norov Tested-by: Alexander Lobakin (mips) Reviewed-by: Alexander Lobakin (mips) Reviewed-by: Andy Shevchenko Acked-by: Will Deacon Tested-by: Wolfram Sang --- arch/arc/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/um/Kconfig | 1 - include/linux/find.h | 13 ------------- lib/Kconfig | 3 --- 8 files changed, 22 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b4ae6058902a..4bec4b0b6ce1 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -20,7 +20,6 @@ config ARC select COMMON_CLK select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) - select GENERIC_FIND_FIRST_BIT # for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17..517d26c8002d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -120,7 +120,6 @@ config ARM64 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP - select GENERIC_FIND_FIRST_BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_IPI select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 0215dc1529e9..00951bfdbab0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -32,7 +32,6 @@ config MIPS select GENERIC_ATOMIC64 if !64BIT select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_IOMAP select GENERIC_IRQ_PROBE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2a5bb4f29cfe..4f80f1c95468 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,6 @@ config S390 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c2ccb85f2ef..60484b39257c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -136,7 +136,6 @@ config X86 select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 95d26a69088b..40d6a06e41c8 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -8,7 +8,6 @@ endmenu config UML_X86 def_bool y - select GENERIC_FIND_FIRST_BIT config 64BIT bool "64-bit kernel" if "$(SUBARCH)" = "x86" diff --git a/include/linux/find.h b/include/linux/find.h index c5410c243e04..ea57f7f38c49 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -101,8 +101,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, } #endif -#ifdef CONFIG_GENERIC_FIND_FIRST_BIT - #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region @@ -147,17 +145,6 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_first_bit -#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) -#endif -#ifndef find_first_zero_bit -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) -#endif - -#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ - #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region diff --git a/lib/Kconfig b/lib/Kconfig index 5e7165e6a346..6a6ae5312fa0 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -65,9 +65,6 @@ config GENERIC_STRNLEN_USER config GENERIC_NET_UTILS bool -config GENERIC_FIND_FIRST_BIT - bool - source "lib/math/Kconfig" config NO_GENERIC_PCI_IOPORT_MAP From f68edc9297bf3f7c94abb54b9b0b053607f7587b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:01 -0700 Subject: [PATCH 0090/1295] lib: add find_first_and_bit() Currently find_first_and_bit() is an alias to find_next_and_bit(). However, it is widely used in cpumask, so it worth to optimize it. This patch adds its own implementation for find_first_and_bit(). On x86_64 find_bit_benchmark says: Before (#define find_first_and_bit(...) find_next_and_bit(..., 0): Start testing find_bit() with random-filled bitmap [ 140.291468] find_first_and_bit: 46890919 ns, 32671 iterations Start testing find_bit() with sparse bitmap [ 140.295028] find_first_and_bit: 7103 ns, 1 iterations After: Start testing find_bit() with random-filled bitmap [ 162.574907] find_first_and_bit: 25045813 ns, 32846 iterations Start testing find_bit() with sparse bitmap [ 162.578458] find_first_and_bit: 4900 ns, 1 iterations (Thanks to Alexey Klimov for thorough testing.) Signed-off-by: Yury Norov Tested-by: Wolfram Sang Tested-by: Alexey Klimov --- include/linux/find.h | 27 +++++++++++++++++++++++++++ lib/find_bit.c | 21 +++++++++++++++++++++ lib/find_bit_benchmark.c | 21 +++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/include/linux/find.h b/include/linux/find.h index ea57f7f38c49..6048f8c97418 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -123,6 +125,31 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif + #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region diff --git a/lib/find_bit.c b/lib/find_bit.c index 0f8e2e369b1d..1b8e4b2a9cba 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -89,6 +89,27 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) EXPORT_SYMBOL(_find_first_bit); #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +EXPORT_SYMBOL(_find_first_and_bit); +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5637c5711db9..db904b57d4b8 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -49,6 +49,25 @@ static int __init test_find_first_bit(void *bitmap, unsigned long len) return 0; } +static int __init test_find_first_and_bit(void *bitmap, const void *bitmap2, unsigned long len) +{ + static DECLARE_BITMAP(cp, BITMAP_LEN) __initdata; + unsigned long i, cnt; + ktime_t time; + + bitmap_copy(cp, bitmap, BITMAP_LEN); + + time = ktime_get(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_and_bit(cp, bitmap2, len); + __clear_bit(i, cp); + } + time = ktime_get() - time; + pr_err("find_first_and_bit: %18llu ns, %6ld iterations\n", time, cnt); + + return 0; +} + static int __init test_find_next_bit(const void *bitmap, unsigned long len) { unsigned long i, cnt; @@ -129,6 +148,7 @@ static int __init find_bit_test(void) * traverse only part of bitmap to avoid soft lockup. */ test_find_first_bit(bitmap, BITMAP_LEN / 10); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN / 2); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); pr_err("\nStart testing find_bit() with sparse bitmap\n"); @@ -145,6 +165,7 @@ static int __init find_bit_test(void) test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); test_find_first_bit(bitmap, BITMAP_LEN); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); /* From 93ba139ba8190c33009c5353ca43c8519443f467 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:02 -0700 Subject: [PATCH 0091/1295] cpumask: use find_first_and_bit() Now we have an efficient implementation for find_first_and_bit(), so switch cpumask to use it where appropriate. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/cpumask.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1e7399fc69c0..c4e1b9ea0ba4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,12 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return 0; +} + static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; @@ -167,7 +173,7 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { - return cpumask_next_and(-1, src1p, src2p); + return cpumask_first_and(src1p, src2p); } static inline int cpumask_any_distribute(const struct cpumask *srcp) @@ -195,6 +201,19 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +static inline +unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) +{ + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer @@ -585,15 +604,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_any(srcp) cpumask_first(srcp) -/** - * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 - * @src1p: the first input - * @src2p: the second input - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ -#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) - /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask From b5c7e7ec7d3418af2544452b45cc67297c857a86 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:03 -0700 Subject: [PATCH 0092/1295] all: replace find_next{,_zero}_bit with find_first{,_zero}_bit where appropriate find_first{,_zero}_bit is a more effective analogue of 'next' version if start == 0. This patch replaces 'next' with 'first' where things look trivial. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- arch/powerpc/platforms/pasemi/dma_lib.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/dma/ti/edma.c | 2 +- drivers/iio/adc/ad7124.c | 2 +- drivers/infiniband/hw/irdma/hw.c | 16 ++++++++-------- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/mc/mc-devnode.c | 2 +- drivers/pci/controller/dwc/pci-dra7xx.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 10 +++++----- drivers/soc/ti/k3-ringacc.c | 4 ++-- drivers/tty/n_tty.c | 2 +- drivers/virt/acrn/ioreq.c | 3 +-- fs/f2fs/segment.c | 8 ++++---- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/dlm/dlmdomain.c | 4 ++-- fs/ocfs2/dlm/dlmmaster.c | 18 +++++++++--------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- fs/ocfs2/dlm/dlmthread.c | 2 +- lib/genalloc.c | 2 +- net/ncsi/ncsi-manage.c | 4 ++-- 21 files changed, 47 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 270fa3c0d372..26427311fc72 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -375,7 +375,7 @@ int pasemi_dma_alloc_flag(void) int bit; retry: - bit = find_next_bit(flags_free, MAX_FLAGS, 0); + bit = find_first_bit(flags_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, flags_free)) @@ -440,7 +440,7 @@ int pasemi_dma_alloc_fun(void) int bit; retry: - bit = find_next_bit(fun_free, MAX_FLAGS, 0); + bit = find_first_bit(fun_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, fun_free)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 14a18ba5ff2c..9aba96d621b9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2021,7 +2021,7 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, while ((slotidx > 0) && (ofs >= ms->npages)) { slotidx--; ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..cef1058ec2fd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -196,7 +196,7 @@ rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) return per_cpu_ptr(sess->cpu_queues, bit); } else if (cpu != 0) { /* Search from 0 to cpu */ - bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + bit = find_first_bit(sess->cpu_queues_bm, cpu); if (bit < cpu) return per_cpu_ptr(sess->cpu_queues, bit); } diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 35d81bd857f1..caa4050ecc02 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -1681,7 +1681,7 @@ static irqreturn_t dma_ccerr_handler(int irq, void *data) dev_dbg(ecc->dev, "EMR%d 0x%08x\n", j, val); emr = val; - for (i = find_next_bit(&emr, 32, 0); i < 32; + for (i = find_first_bit(&emr, 32); i < 32; i = find_next_bit(&emr, 32, i + 1)) { int k = (j << 5) + i; diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index e45c600fccc0..bc2cfa5f9592 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -347,7 +347,7 @@ static int ad7124_find_free_config_slot(struct ad7124_state *st) { unsigned int free_cfg_slot; - free_cfg_slot = find_next_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS, 0); + free_cfg_slot = find_first_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS); if (free_cfg_slot == AD7124_MAX_CONFIGS) return -1; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index b4c657f5f2f9..e2dbd6b1ffcd 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1709,14 +1709,14 @@ clean_msixtbl: */ static void irdma_get_used_rsrc(struct irdma_device *iwdev) { - iwdev->rf->used_pds = find_next_zero_bit(iwdev->rf->allocated_pds, - iwdev->rf->max_pd, 0); - iwdev->rf->used_qps = find_next_zero_bit(iwdev->rf->allocated_qps, - iwdev->rf->max_qp, 0); - iwdev->rf->used_cqs = find_next_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq, 0); - iwdev->rf->used_mrs = find_next_zero_bit(iwdev->rf->allocated_mrs, - iwdev->rf->max_mr, 0); + iwdev->rf->used_pds = find_first_zero_bit(iwdev->rf->allocated_pds, + iwdev->rf->max_pd); + iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, + iwdev->rf->max_qp); + iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, + iwdev->rf->max_cq); + iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, + iwdev->rf->max_mr); } void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index 551689d371a7..7322e7cd9753 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -106,7 +106,7 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, /* Part 1: Find a free minor number */ mutex_lock(&cec_devnode_lock); - minor = find_next_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES, 0); + minor = find_first_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES); if (minor == CEC_NUM_DEVICES) { mutex_unlock(&cec_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/media/mc/mc-devnode.c b/drivers/media/mc/mc-devnode.c index f11382afe23b..680fbb3a9340 100644 --- a/drivers/media/mc/mc-devnode.c +++ b/drivers/media/mc/mc-devnode.c @@ -217,7 +217,7 @@ int __must_check media_devnode_register(struct media_device *mdev, /* Part 1: Find a free minor number */ mutex_lock(&media_devnode_lock); - minor = find_next_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES, 0); + minor = find_first_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES); if (minor == MEDIA_NUM_DEVICES) { mutex_unlock(&media_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index a4221f6f3629..279a6fa56584 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -213,7 +213,7 @@ static int dra7xx_pcie_handle_msi(struct pcie_port *pp, int index) if (!val) return 0; - pos = find_next_bit(&val, MAX_MSI_IRQS_PER_CTRL, 0); + pos = find_first_bit(&val, MAX_MSI_IRQS_PER_CTRL); while (pos != MAX_MSI_IRQS_PER_CTRL) { generic_handle_domain_irq(pp->irq_domain, (index * MAX_MSI_IRQS_PER_CTRL) + pos); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 5dedb3de271d..77dfe293bf23 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -17990,8 +17990,8 @@ lpfc_sli4_alloc_xri(struct lpfc_hba *phba) * the driver starts at 0 each time. */ spin_lock_irq(&phba->hbalock); - xri = find_next_zero_bit(phba->sli4_hba.xri_bmask, - phba->sli4_hba.max_cfg_param.max_xri, 0); + xri = find_first_zero_bit(phba->sli4_hba.xri_bmask, + phba->sli4_hba.max_cfg_param.max_xri); if (xri >= phba->sli4_hba.max_cfg_param.max_xri) { spin_unlock_irq(&phba->hbalock); return NO_XRI; @@ -19668,7 +19668,7 @@ lpfc_sli4_alloc_rpi(struct lpfc_hba *phba) max_rpi = phba->sli4_hba.max_cfg_param.max_rpi; rpi_limit = phba->sli4_hba.next_rpi; - rpi = find_next_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit, 0); + rpi = find_first_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit); if (rpi >= rpi_limit) rpi = LPFC_RPI_ALLOC_ERROR; else { @@ -20311,8 +20311,8 @@ next_priority: * have been tested so that we can detect when we should * change the priority level. */ - next_fcf_index = find_next_bit(phba->fcf.fcf_rr_bmask, - LPFC_SLI4_FCF_TBL_INDX_MAX, 0); + next_fcf_index = find_first_bit(phba->fcf.fcf_rr_bmask, + LPFC_SLI4_FCF_TBL_INDX_MAX); } diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 312ba0f98ad7..573be88f8191 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -358,8 +358,8 @@ struct k3_ring *k3_ringacc_request_ring(struct k3_ringacc *ringacc, goto out; if (flags & K3_RINGACC_RING_USE_PROXY) { - proxy_id = find_next_zero_bit(ringacc->proxy_inuse, - ringacc->num_proxies, 0); + proxy_id = find_first_zero_bit(ringacc->proxy_inuse, + ringacc->num_proxies); if (proxy_id == ringacc->num_proxies) goto error; } diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 5be6d02dc690..9fc2319a394d 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1975,7 +1975,7 @@ static bool canon_copy_from_read_buf(struct tty_struct *tty, more = n - (size - tail); if (eol == N_TTY_BUF_SIZE && more) { /* scan wrapped without finding set bit */ - eol = find_next_bit(ldata->read_flags, more, 0); + eol = find_first_bit(ldata->read_flags, more); found = eol != more; } else found = eol != size; diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c index 80b2e3f0e276..5ff1c53740c0 100644 --- a/drivers/virt/acrn/ioreq.c +++ b/drivers/virt/acrn/ioreq.c @@ -246,8 +246,7 @@ void acrn_ioreq_request_clear(struct acrn_vm *vm) spin_lock_bh(&vm->ioreq_clients_lock); client = vm->default_client; if (client) { - vcpu = find_next_bit(client->ioreqs_map, - ACRN_IO_REQUEST_MAX, 0); + vcpu = find_first_bit(client->ioreqs_map, ACRN_IO_REQUEST_MAX); while (vcpu < ACRN_IO_REQUEST_MAX) { acrn_ioreq_complete_request(client, vcpu, NULL); vcpu = find_next_bit(client->ioreqs_map, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..913552c98171 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2558,8 +2558,8 @@ find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; @@ -2574,8 +2574,8 @@ find_other_zone: left_start--; continue; } - left_start = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f89ffcbd585f..a17be1618bf7 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -379,7 +379,7 @@ static void o2hb_nego_timeout(struct work_struct *work) o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); /* lowest node as master node to make negotiate decision. */ - master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); if (master_node == o2nm_this_node()) { if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9f90fc9551e1..c4eccd499db8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1045,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) int status, ret = 0, i; char *p; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); @@ -1217,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) struct o2nm_node *node; int ret = 0, status, count, i; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9b88219febb5..227da5b1b6ab 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -861,7 +861,7 @@ lookup: * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. */ - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -912,7 +912,7 @@ redo_request: dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -1079,7 +1079,7 @@ recheck: sleep = 1; /* have all nodes responded? */ if (voting_done && !*blocked) { - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (dlm->node_num <= bit) { /* my node number is lowest. * now tell other nodes that I am @@ -1234,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, } else { mlog(ML_ERROR, "node down! %d\n", node); if (blocked) { - int lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, 0); + int lowest = find_first_bit(mle->maybe_map, + O2NM_MAX_NODES); /* act like it was never there */ clear_bit(node, mle->maybe_map); @@ -1795,7 +1795,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, "MLE for it! (%.*s)\n", assert->node_idx, namelen, name); } else { - int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ @@ -2521,7 +2521,7 @@ static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, } if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES); if (node_ref >= O2NM_MAX_NODES) return 0; } @@ -3303,7 +3303,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm, BUG_ON(mle->type != DLM_MLE_BLOCK); spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit != dead_node) { mlog(0, "mle found, but dead node %u would not have been " "master\n", dead_node); @@ -3542,7 +3542,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) spin_lock(&dlm->master_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); - BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES)); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5cd5f7511dac..52ad342fec3e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -451,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c350bd4df770..eedf07ca23ca 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,7 +92,7 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) return 0; /* Another node has this resource with this node as the master */ - bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + bit = find_first_bit(res->refmap, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) return 0; diff --git a/lib/genalloc.c b/lib/genalloc.c index 9a57257988c7..00fc50d0a640 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -251,7 +251,7 @@ void gen_pool_destroy(struct gen_pool *pool) list_del(&chunk->next_chunk); end_bit = chunk_size(chunk) >> order; - bit = find_next_bit(chunk->bits, end_bit, 0); + bit = find_first_bit(chunk->bits, end_bit); BUG_ON(bit < end_bit); vfree(chunk); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 7121ce2a47c0..78814417d753 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -608,7 +608,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, bitmap = &ncf->bitmap; spin_lock_irqsave(&nc->lock, flags); - index = find_next_bit(bitmap, ncf->n_vids, 0); + index = find_first_bit(bitmap, ncf->n_vids); if (index >= ncf->n_vids) { spin_unlock_irqrestore(&nc->lock, flags); return -1; @@ -667,7 +667,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return -1; } - index = find_next_zero_bit(bitmap, ncf->n_vids, 0); + index = find_first_zero_bit(bitmap, ncf->n_vids); if (index < 0 || index >= ncf->n_vids) { netdev_err(ndp->ndev.dev, "Channel %u already has all VLAN filters set\n", From 4ade0818cf048bb166e875ed4f8b456e6c2c7b3c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:04 -0700 Subject: [PATCH 0093/1295] tools: sync tools/bitmap with mother linux Remove tools/include/asm-generic/bitops/find.h and copy include/linux/bitmap.h to tools. find_*_le() functions are not copied because not needed in tools. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- MAINTAINERS | 2 +- tools/include/asm-generic/bitops.h | 1 - tools/include/linux/bitmap.h | 7 +- .../{asm-generic/bitops => linux}/find.h | 81 +++++++++++++++++-- tools/lib/find_bit.c | 20 +++++ 5 files changed, 100 insertions(+), 11 deletions(-) rename tools/include/{asm-generic/bitops => linux}/find.h (63%) diff --git a/MAINTAINERS b/MAINTAINERS index 4646786f5302..092b1c6fd2f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3365,8 +3365,8 @@ F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c F: lib/test_bitmap.c -F: tools/include/asm-generic/bitops/find.h F: tools/include/linux/bitmap.h +F: tools/include/linux/find.h F: tools/lib/bitmap.c F: tools/lib/find_bit.c diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 5d2ab38965cc..9ab313e93555 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -18,7 +18,6 @@ #include #include #include -#include #ifndef _TOOLS_LINUX_BITOPS_H_ #error only can be included directly diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 95611df1d26e..ea97804d04d4 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PERF_BITOPS_H -#define _PERF_BITOPS_H +#ifndef _TOOLS_LINUX_BITMAP_H +#define _TOOLS_LINUX_BITMAP_H #include #include +#include #include #include @@ -181,4 +182,4 @@ static inline int bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -#endif /* _PERF_BITOPS_H */ +#endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/linux/find.h similarity index 63% rename from tools/include/asm-generic/bitops/find.h rename to tools/include/linux/find.h index 6481fd11012a..47e2bd6c5174 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/linux/find.h @@ -1,11 +1,19 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ -#define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ +#ifndef _TOOLS_LINUX_FIND_H_ +#define _TOOLS_LINUX_FIND_H_ + +#ifndef _TOOLS_LINUX_BITMAP_H +#error tools: only can be included directly +#endif + +#include extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -96,7 +104,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, #endif #ifndef find_first_bit - /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at @@ -116,11 +123,34 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return _find_first_bit(addr, size); } +#endif -#endif /* find_first_bit */ +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif #ifndef find_first_zero_bit - /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at @@ -142,4 +172,43 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */ +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + +/** + * find_next_clump8 - find next 8-bit clump with set bits in a memory region + * @clump: location to store copy of found clump + * @addr: address to base the search on + * @size: bitmap size in number of bits + * @offset: bit offset at which to start searching + * + * Returns the bit offset for the next set clump; the found clump value is + * copied to the location pointed by @clump. If no bits are set, returns @size. + */ +extern unsigned long find_next_clump8(unsigned long *clump, + const unsigned long *addr, + unsigned long size, unsigned long offset); + +#define find_first_clump8(clump, bits, size) \ + find_next_clump8((clump), (bits), (size), 0) + + +#endif /*__LINUX_FIND_H_ */ diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 109aa7ffcf97..ba4b8d94e004 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -96,6 +96,26 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. From 9b51d9d866482a703646fd4c07e433c3d9d88efd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:05 -0700 Subject: [PATCH 0094/1295] cpumask: replace cpumask_next_* with cpumask_first_* where appropriate cpumask_first() is a more effective analogue of 'next' version if n == -1 (which means start == 0). This patch replaces 'next' with 'first' where things look trivial. There's no cpumask_first_zero() function, so create it. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- block/blk-mq.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/soc/fsl/qbman/bman_portal.c | 2 +- drivers/soc/fsl/qbman/qman_portal.c | 2 +- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/time/clocksource.c | 4 ++-- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..ef56e753ab38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2967,7 +2967,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { - if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b107835242ad..8c70ab3c436a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2101,7 +2101,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c index acda8a5637c5..4d7b9caee1c4 100644 --- a/drivers/soc/fsl/qbman/bman_portal.c +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -155,7 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev) } spin_lock(&bman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __bman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c index 96f74a1dc603..e23b60618c1a 100644 --- a/drivers/soc/fsl/qbman/qman_portal.c +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -248,7 +248,7 @@ static int qman_portal_probe(struct platform_device *pdev) pcfg->pools = qm_get_pools_sdqcr(); spin_lock(&qman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __qman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c4e1b9ea0ba4..64dae70d31f5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return 0; +} + static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_zero - get the first unset cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b8a14d2fb5ba..f29d1a524fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -257,7 +257,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -279,7 +279,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } From bc9d6635c293a2ac30c6319f7cfd08860ab7948a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:06 -0700 Subject: [PATCH 0095/1295] include/linux: move for_each_bit() macros from bitops.h to find.h for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5e62e2383b7f..7aaed501f768 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,40 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/** - * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits - * @start: bit offset to start search and to store the current iteration offset - * @clump: location to store copy of current 8-bit clump - * @bits: bitmap address to base the search on - * @size: bitmap size in number of bits - */ -#define for_each_set_clump8(start, clump, bits, size) \ - for ((start) = find_first_clump8(&(clump), (bits), (size)); \ - (start) < (size); \ - (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) - static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h b/include/linux/find.h index 6048f8c97418..4500e8ab93e2 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define for_each_clear_bit(bit, addr, size) \ + for ((bit) = find_first_zero_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/** + * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits + * @start: bit offset to start search and to store the current iteration offset + * @clump: location to store copy of current 8-bit clump + * @bits: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_clump8(start, clump, bits, size) \ + for ((start) = find_first_clump8(&(clump), (bits), (size)); \ + (start) < (size); \ + (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) + #endif /*__LINUX_FIND_H_ */ From 7516be9931b8bc8bcaac8531f490b42ab11ded1e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:07 -0700 Subject: [PATCH 0096/1295] find: micro-optimize for_each_{set,clear}_bit() The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit(). Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit(). Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ + for ((bit) = find_next_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) @@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_bit((addr), (size), (bit) + 1)) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ + for ((bit) = find_next_zero_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) From 749443de8dde3b8b420ee8b4daac4d929a6adeb9 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:08 -0700 Subject: [PATCH 0097/1295] Replace for_each_*_bit_from() with for_each_*_bit() where appropriate A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit(). No functional changes, but might improve on readability. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c132daabe615..3e6f6b448f6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void) void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector; - for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false); if (nr_legacy_irqs() > 1) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 242a5fd8b932..06bde46df451 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1047,7 +1047,7 @@ pm_put: void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) { - unsigned int i = 0; + unsigned int i; dev_err(gpu->dev, "recover hung GPU!\n"); @@ -1060,7 +1060,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) /* complete all events, the GPU won't do it after the reset */ spin_lock(&gpu->event_spinlock); - for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) + for_each_set_bit(i, gpu->event_bitmap, ETNA_NR_EVENTS) complete(&gpu->event_free); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); spin_unlock(&gpu->event_spinlock); diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 2a4bed0ab226..7352d2b3c756 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask gpio_status = reg; - gpio_nr = 0; - for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) { + for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); } From 801a57365fc836d7ec866e2069d0b21d79925c1e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:10 -0700 Subject: [PATCH 0098/1295] mm/percpu: micro-optimize pcpu_is_populated() bitmap_next_clear_region() calls find_next_zero_bit() and find_next_bit() sequentially to find a range of clear bits. In case of pcpu_is_populated() there's a chance to return earlier if bitmap has all bits set. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou --- mm/percpu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..0c6f85c5514f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1070,17 +1070,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - unsigned int page_start, page_end, rs, re; + unsigned int start, end; - page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); - page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - rs = page_start; - bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); - if (rs >= page_end) + start = find_next_zero_bit(chunk->populated, end, start); + if (start >= end) return true; - *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + end = find_next_bit(chunk->populated, end, start + 1); + + *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } From ec288a2cf7ca40a939316b6df206ab845bb112d1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:11 -0700 Subject: [PATCH 0099/1295] bitmap: unify find_bit operations bitmap_for_each_{set,clear}_region() are similar to for_each_bit() macros in include/linux/find.h, but interface and implementation of them are different. This patch adds for_each_bitrange() macros and drops unused bitmap_*_region() API in sake of unification. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou Acked-by: Ulf Hansson # For MMC --- drivers/mmc/host/renesas_sdhi_core.c | 2 +- include/linux/bitmap.h | 33 ---------------- include/linux/find.h | 56 ++++++++++++++++++++++++++++ mm/percpu.c | 20 ++++------ 4 files changed, 65 insertions(+), 46 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index f5b2684ad805..6de30a2719a3 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -628,7 +628,7 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) * is at least SH_MOBILE_SDHI_MIN_TAP_ROW probes long then use the * center index as the tap, otherwise bail out. */ - bitmap_for_each_set_region(bitmap, rs, re, 0, taps_size) { + for_each_set_bitrange(rs, re, bitmap, taps_size) { if (re - rs > tap_cnt) { tap_end = re; tap_start = rs; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ead4a150bd7f..7dba0847510c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -55,12 +55,6 @@ struct device; * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above - * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region - * bitmap_next_set_region(map, &start, &end, nbits) Find next set region - * bitmap_for_each_clear_region(map, rs, re, start, end) - * Iterate over all clear regions - * bitmap_for_each_set_region(map, rs, re, start, end) - * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest @@ -467,14 +461,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_clear_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) @@ -483,25 +469,6 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, *re = find_next_zero_bit(bitmap, end, *rs + 1); } -/* - * Bitmap region iterators. Iterates over the bitmap between [@start, @end). - * @rs and @re should be integer variables and will be set to start and end - * index of the current clear or set region. - */ -#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) - -#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end))) - /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/find.h b/include/linux/find.h index ae9ed52b52b8..5bb6db213bcb 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -301,6 +301,62 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) +/** + * for_each_set_bitrange - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit) + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), 0), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first unset bit) + * @e: bit offset of end of current bitrange (first set bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), 0), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset diff --git a/mm/percpu.c b/mm/percpu.c index 0c6f85c5514f..293009cc03ef 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - unsigned int rs, re, start; /* region start, region end */ + unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - bitmap_for_each_clear_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) - pcpu_block_update(block, rs, re); + for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) + pcpu_block_update(block, start, end); } /** @@ -1852,13 +1851,12 @@ area_found: /* populate if not all pages are already there */ if (!is_atomic) { - unsigned int page_start, page_end, rs, re; + unsigned int page_end, rs, re; - page_start = PFN_DOWN(off); + rs = PFN_DOWN(off); page_end = PFN_UP(off + size); - bitmap_for_each_clear_region(chunk->populated, rs, re, - page_start, page_end) { + for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -2014,8 +2012,7 @@ static void pcpu_balance_free(bool empty_only) list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; - bitmap_for_each_set_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -2085,8 +2082,7 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - bitmap_for_each_clear_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); From db7313005e9c2d4e80888dd18d4a83926b920e8c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:12 -0700 Subject: [PATCH 0100/1295] lib: bitmap: add performance test for bitmap_print_to_pagebuf Functional tests for bitmap_print_to_pagebuf() are provided in lib/test_printf.c. This patch adds performance test for a case of fully set bitmap. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- lib/test_bitmap.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index d33fa5a61b95..0c82f07f74fc 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -446,6 +446,42 @@ static void __init test_bitmap_parselist(void) } } +static void __init test_bitmap_printlist(void) +{ + unsigned long *bmap = kmalloc(PAGE_SIZE, GFP_KERNEL); + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + char expected[256]; + int ret, slen; + ktime_t time; + + if (!buf || !bmap) + goto out; + + memset(bmap, -1, PAGE_SIZE); + slen = snprintf(expected, 256, "0-%ld", PAGE_SIZE * 8 - 1); + if (slen < 0) + goto out; + + time = ktime_get(); + ret = bitmap_print_to_pagebuf(true, buf, bmap, PAGE_SIZE * 8); + time = ktime_get() - time; + + if (ret != slen + 1) { + pr_err("bitmap_print_to_pagebuf: result is %d, expected %d\n", ret, slen); + goto out; + } + + if (strncmp(buf, expected, slen)) { + pr_err("bitmap_print_to_pagebuf: result is %s, expected %s\n", buf, expected); + goto out; + } + + pr_err("bitmap_print_to_pagebuf: input is '%s', Time: %llu\n", buf, time); +out: + kfree(buf); + kfree(bmap); +} + static const unsigned long parse_test[] __initconst = { BITMAP_FROM_U64(0), BITMAP_FROM_U64(1), @@ -818,6 +854,7 @@ static void __init selftest(void) test_bitmap_arr32(); test_bitmap_parse(); test_bitmap_parselist(); + test_bitmap_printlist(); test_mem_optimisations(); test_for_each_set_clump8(); test_bitmap_cut(); From 15325b4f768f2b27b5765489eeab6ec0d6b5e902 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:13 -0700 Subject: [PATCH 0101/1295] vsprintf: rework bitmap_list_string bitmap_list_string() is very ineffective when printing bitmaps with long ranges of set bits because it calls find_next_bit for each bit in the bitmap. We can do better by detecting ranges of set bits. In my environment, before/after is 943008/31008 ns. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- lib/vsprintf.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 58d5e567f836..d4b4e481045b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1241,20 +1241,13 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); - /* current bit is 'cur', most recently seen range is [rbot, rtop] */ - int cur, rbot, rtop; bool first = true; + int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; - rbot = cur = find_first_bit(bitmap, nr_bits); - while (cur < nr_bits) { - rtop = cur; - cur = find_next_bit(bitmap, nr_bits, cur + 1); - if (cur < nr_bits && cur <= rtop + 1) - continue; - + for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; @@ -1263,15 +1256,12 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, first = false; buf = number(buf, end, rbot, default_dec_spec); - if (rbot < rtop) { - if (buf < end) - *buf = '-'; - buf++; + if (rtop == rbot + 1) + continue; - buf = number(buf, end, rtop, default_dec_spec); - } - - rbot = cur; + if (buf < end) + *buf = '-'; + buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } From 830af2eba40327abec64325a5b08b1e85c37a2e0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Jan 2022 21:37:58 +0100 Subject: [PATCH 0102/1295] netfilter: conntrack: don't increment invalid counter on NF_REPEAT The packet isn't invalid, REPEAT means we're trying again after cleaning out a stale connection, e.g. via tcp tracker. This caused increases of invalid stat counter in a test case involving frequent connection reuse, even though no packet is actually invalid. Fixes: 56a62e2218f5 ("netfilter: conntrack: fix NF_REPEAT handling") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 894a325d39f2..d6aa5b47031e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1924,15 +1924,17 @@ repeat: pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; - NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) - NF_CT_STAT_INC_ATOMIC(state->net, drop); /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. */ if (ret == -NF_REPEAT) goto repeat; + + NF_CT_STAT_INC_ATOMIC(state->net, invalid); + if (ret == -NF_DROP) + NF_CT_STAT_INC_ATOMIC(state->net, drop); + ret = -ret; goto out; } From 87b9d74fb0be80054c729e8d6a119ca0955cedf3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 10 Jan 2022 15:29:53 +0000 Subject: [PATCH 0103/1295] powerpc/time: Fix build failure due to do_hard_irq_enable() on PPC32 CC arch/powerpc/kernel/time.o In file included from : ./arch/powerpc/include/asm/hw_irq.h: In function 'do_hard_irq_enable': ././include/linux/compiler_types.h:335:45: error: call to '__compiletime_assert_35' declared with attribute error: BUILD_BUG failed 335 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ ././include/linux/compiler_types.h:316:25: note: in definition of macro '__compiletime_assert' 316 | prefix ## suffix(); \ | ^~~~~~ ././include/linux/compiler_types.h:335:9: note: in expansion of macro '_compiletime_assert' 335 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert' 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) | ^~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:59:21: note: in expansion of macro 'BUILD_BUG_ON_MSG' 59 | #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") | ^~~~~~~~~~~~~~~~ ./arch/powerpc/include/asm/hw_irq.h:483:9: note: in expansion of macro 'BUILD_BUG' 483 | BUILD_BUG(); | ^~~~~~~~~ should_hard_irq_enable() returns false on PPC32 so this BUILD_BUG() shouldn't trigger. Force inlining of should_hard_irq_enable() Fixes: 0faf20a1ad16 ("powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use") Signed-off-by: Christophe Leroy Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/247e01e0e10f4dbc59b5ff89e81702eb1ee7641e.1641828571.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index a58fb4aa6c81..674e5aaafcbd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -473,7 +473,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static inline bool should_hard_irq_enable(void) +static __always_inline bool should_hard_irq_enable(void) { return false; } From d37823c3528e5e0705fc7746bcbc2afffb619259 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 10 Jan 2022 15:29:25 +0000 Subject: [PATCH 0104/1295] powerpc/32s: Fix kasan_init_region() for KASAN It has been reported some configuration where the kernel doesn't boot with KASAN enabled. This is due to wrong BAT allocation for the KASAN area: ---[ Data Block Address Translation ]--- 0: 0xc0000000-0xcfffffff 0x00000000 256M Kernel rw m 1: 0xd0000000-0xdfffffff 0x10000000 256M Kernel rw m 2: 0xe0000000-0xefffffff 0x20000000 256M Kernel rw m 3: 0xf8000000-0xf9ffffff 0x2a000000 32M Kernel rw m 4: 0xfa000000-0xfdffffff 0x2c000000 64M Kernel rw m A BAT must have both virtual and physical addresses alignment matching the size of the BAT. This is not the case for BAT 4 above. Fix kasan_init_region() by using block_size() function that is in book3s32/mmu.c. To be able to reuse it here, make it non static and change its name to bat_block_size() in order to avoid name conflict with block_size() defined in Also reuse find_free_bat() to avoid an error message from setbat() when no BAT is available. And allocate memory outside of linear memory mapping to avoid wasting that precious space. With this change we get correct alignment for BATs and KASAN shadow memory is allocated outside the linear memory space. ---[ Data Block Address Translation ]--- 0: 0xc0000000-0xcfffffff 0x00000000 256M Kernel rw 1: 0xd0000000-0xdfffffff 0x10000000 256M Kernel rw 2: 0xe0000000-0xefffffff 0x20000000 256M Kernel rw 3: 0xf8000000-0xfbffffff 0x7c000000 64M Kernel rw 4: 0xfc000000-0xfdffffff 0x7a000000 32M Kernel rw Fixes: 7974c4732642 ("powerpc/32s: Implement dedicated kasan_init_region()") Cc: stable@vger.kernel.org Reported-by: Maxime Bizon Signed-off-by: Christophe Leroy Tested-by: Maxime Bizon Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7a50ef902494d1325227d47d33dada01e52e5518.1641818726.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 + arch/powerpc/mm/book3s32/mmu.c | 10 ++-- arch/powerpc/mm/kasan/book3s_32.c | 55 ++++++++++--------- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 7be27862329f..78c6a5fde1d6 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -223,6 +223,8 @@ static __always_inline void update_user_segments(u32 val) update_user_segment(15, val); } +int __init find_free_bat(void); +unsigned int bat_block_size(unsigned long base, unsigned long top); #endif /* !__ASSEMBLY__ */ /* We happily ignore the smaller BATs on 601, we don't actually use diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 94045b265b6b..203735caf691 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -76,7 +76,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static int __init find_free_bat(void) +int __init find_free_bat(void) { int b; int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; @@ -100,7 +100,7 @@ static int __init find_free_bat(void) * - block size has to be a power of two. This is calculated by finding the * highest bit set to 1. */ -static unsigned int block_size(unsigned long base, unsigned long top) +unsigned int bat_block_size(unsigned long base, unsigned long top) { unsigned int max_size = SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; @@ -145,7 +145,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to int idx; while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); + unsigned int size = bat_block_size(base, top); if (size < 128 << 10) break; @@ -201,12 +201,12 @@ void mmu_mark_initmem_nx(void) unsigned long size; for (i = 0; i < nb - 1 && base < top;) { - size = block_size(base, top); + size = bat_block_size(base, top); setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); base += size; } if (base < top) { - size = block_size(base, top); + size = bat_block_size(base, top); if ((top - base) > size) { size <<= 1; if (strict_kernel_rwx_enabled() && base + size > border) diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c index 35b287b0a8da..450a67ef0bbe 100644 --- a/arch/powerpc/mm/kasan/book3s_32.c +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -10,48 +10,51 @@ int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); - unsigned long k_cur = k_start; - int k_size = k_end - k_start; - int k_size_base = 1 << (ffs(k_size) - 1); + unsigned long k_nobat = k_start; + unsigned long k_cur; + phys_addr_t phys; int ret; - void *block; - block = memblock_alloc(k_size, k_size_base); + while (k_nobat < k_end) { + unsigned int k_size = bat_block_size(k_nobat, k_end); + int idx = find_free_bat(); - if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) { - int shift = ffs(k_size - k_size_base); - int k_size_more = shift ? 1 << (shift - 1) : 0; + if (idx == -1) + break; + if (k_size < SZ_128K) + break; + phys = memblock_phys_alloc_range(k_size, k_size, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + break; - setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL); - if (k_size_more >= SZ_128K) - setbat(-1, k_start + k_size_base, __pa(block) + k_size_base, - k_size_more, PAGE_KERNEL); - if (v_block_mapped(k_start)) - k_cur = k_start + k_size_base; - if (v_block_mapped(k_start + k_size_base)) - k_cur = k_start + k_size_base + k_size_more; - - update_bats(); + setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL); + k_nobat += k_size; } + if (k_nobat != k_start) + update_bats(); - if (!block) - block = memblock_alloc(k_size, PAGE_SIZE); - if (!block) - return -ENOMEM; + if (k_nobat < k_end) { + phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + return -ENOMEM; + } ret = kasan_init_shadow_page_tables(k_start, k_end); if (ret) return ret; - kasan_update_early_region(k_start, k_cur, __pte(0)); + kasan_update_early_region(k_start, k_nobat, __pte(0)); - for (; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_off_k(k_cur); - void *va = block + k_cur - k_start; - pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL); __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); } flush_tlb_kernel_range(k_start, k_end); + memset(kasan_mem_to_shadow(start), 0, k_end - k_start); + return 0; } From 3fe7fa5843d204e235d92902190fecb972a3f9cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 10 Dec 2021 15:09:21 -0500 Subject: [PATCH 0105/1295] mm: Add folio_put_refs() This is like folio_put(), but puts N references at once instead of just one. It's like put_page_refs(), but does one atomic operation instead of two, and is available to more than just gup.c. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c768a7c81b0b..cb98f75b245e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1244,6 +1244,26 @@ static inline void folio_put(struct folio *folio) __put_page(&folio->page); } +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) +{ + if (folio_ref_sub_and_test(folio, refs)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); From 3abb28e275bfbe60136db37eae6679c3e1928cd5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Jan 2022 13:03:48 -0500 Subject: [PATCH 0106/1295] filemap: Use folio_put_refs() in filemap_free_folio() This shrinks filemap_free_folio() by 55 bytes in my .config; 24 bytes from removing the VM_BUG_ON_FOLIO() and 31 bytes from unifying the small/large folio paths. We could just use folio_ref_sub() here since the caller should hold a reference (as the VM_BUG_ON_FOLIO() was asserting), but that's fragile. Signed-off-by: Matthew Wilcox (Oracle) --- mm/filemap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2fd9b2f24025..afc8f5ca85ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -231,17 +231,15 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); + int refs = 1; freepage = mapping->a_ops->freepage; if (freepage) freepage(&folio->page); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { - folio_ref_sub(folio, folio_nr_pages(folio)); - VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio); - } else { - folio_put(folio); - } + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + refs = folio_nr_pages(folio); + folio_put_refs(folio, refs); } /** From 429a64f6e91fbfe4912d17247c27d0d66767b1c2 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 14 Jan 2022 08:43:55 +0530 Subject: [PATCH 0107/1295] powerpc/perf: Only define power_pmu_wants_prompt_pmi() for CONFIG_PPC64 power_pmu_wants_prompt_pmi() is used to decide if PMIs should be taken promptly. This is valid only for ppc64 and is used only if CONFIG_PPC_BOOK3S_64=y. Hence include the function under config check for PPC64. Fixes warning for 32-bit compilation: arch/powerpc/perf/core-book3s.c:2455:6: warning: no previous prototype for 'power_pmu_wants_prompt_pmi' 2455 | bool power_pmu_wants_prompt_pmi(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 5a7745b96f43 ("powerpc/64s/perf: add power_pmu_wants_prompt_pmi to say whether perf wants PMIs to be soft-NMI") Reported-by: kernel test robot Signed-off-by: Athira Rajeev Reviewed-by: Nicholas Piggin [mpe: Move inside existing CONFIG_PPC64 ifdef block] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220114031355.87480-1-atrajeev@linux.vnet.ibm.com --- arch/powerpc/perf/core-book3s.c | 58 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a684901b6965..32b98b7a1f86 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -776,6 +776,34 @@ static void pmao_restore_workaround(bool ebb) mtspr(SPRN_PMC6, pmcs[5]); } +/* + * If the perf subsystem wants performance monitor interrupts as soon as + * possible (e.g., to sample the instruction address and stack chain), + * this should return true. The IRQ masking code can then enable MSR[EE] + * in some places (e.g., interrupt handlers) that allows PMI interrupts + * through to improve accuracy of profiles, at the cost of some performance. + * + * The PMU counters can be enabled by other means (e.g., sysfs raw SPR + * access), but in that case there is no need for prompt PMI handling. + * + * This currently returns true if any perf counter is being used. It + * could possibly return false if only events are being counted rather than + * samples being taken, but for now this is good enough. + */ +bool power_pmu_wants_prompt_pmi(void) +{ + struct cpu_hw_events *cpuhw; + + /* + * This could simply test local_paca->pmcregs_in_use if that were not + * under ifdef KVM. + */ + if (!ppmu) + return false; + + cpuhw = this_cpu_ptr(&cpu_hw_events); + return cpuhw->n_events; +} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -2438,36 +2466,6 @@ static void perf_event_interrupt(struct pt_regs *regs) perf_sample_event_took(sched_clock() - start_clock); } -/* - * If the perf subsystem wants performance monitor interrupts as soon as - * possible (e.g., to sample the instruction address and stack chain), - * this should return true. The IRQ masking code can then enable MSR[EE] - * in some places (e.g., interrupt handlers) that allows PMI interrupts - * though to improve accuracy of profiles, at the cost of some performance. - * - * The PMU counters can be enabled by other means (e.g., sysfs raw SPR - * access), but in that case there is no need for prompt PMI handling. - * - * This currently returns true if any perf counter is being used. It - * could possibly return false if only events are being counted rather than - * samples being taken, but for now this is good enough. - */ -bool power_pmu_wants_prompt_pmi(void) -{ - struct cpu_hw_events *cpuhw; - - /* - * This could simply test local_paca->pmcregs_in_use if that were not - * under ifdef KVM. - */ - - if (!ppmu) - return false; - - cpuhw = this_cpu_ptr(&cpu_hw_events); - return cpuhw->n_events; -} - static int power_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); From 45378cd33905966baf16d12ab0adbd56794ee075 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 8 Jan 2022 14:01:18 +0000 Subject: [PATCH 0108/1295] irqchip/apple-aic: Drop unused ipi_hwirq field This field was never used, remove it. Signed-off-by: Marc Zyngier Acked-by: Hector Martin Link: https://lore.kernel.org/r/20220108140118.3378937-1-maz@kernel.org --- drivers/irqchip/irq-apple-aic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 3759dc36cc8f..53c4783b5b9b 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -178,7 +178,6 @@ struct aic_irq_chip { struct irq_domain *hw_domain; struct irq_domain *ipi_domain; int nr_hw; - int ipi_hwirq; }; static DEFINE_PER_CPU(uint32_t, aic_fiq_unmasked); From 291e79c7e2eb6fdc016453597b78482e06199d0f Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sun, 9 Jan 2022 15:54:32 +0100 Subject: [PATCH 0109/1295] irqchip/realtek-rtl: Map control data to virq The driver assigned the irqchip and irq handler to the hardware irq, instead of the virq. This is incorrect, and only worked because these irq numbers happened to be the same on the devices used for testing the original driver. Fixes: 9f3a0f34b84a ("irqchip: Add support for Realtek RTL838x/RTL839x interrupt controller") Signed-off-by: Sander Vanheule Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/4b4936606480265db47df152f00bc2ed46340599.1641739718.git.sander@svanheule.net --- drivers/irqchip/irq-realtek-rtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index fd9f275592d2..d6788dd93c7b 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -62,7 +62,7 @@ static struct irq_chip realtek_ictl_irq = { static int intc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { - irq_set_chip_and_handler(hw, &realtek_ictl_irq, handle_level_irq); + irq_set_chip_and_handler(irq, &realtek_ictl_irq, handle_level_irq); return 0; } From 91351b5dd0fd494eb2d85e1bb6aca77b067447e0 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sun, 9 Jan 2022 15:54:33 +0100 Subject: [PATCH 0110/1295] irqchip/realtek-rtl: Fix off-by-one in routing There is an offset between routing values (1..6) and the connected MIPS CPU interrupts (2..7), but no distinction was made between these two values. This issue was previously hidden during testing, because an interrupt mapping was used where for each required interrupt another (unused) routing was configured, with an offset of +1. Offset the CPU IRQ numbers by -1 to retrieve the correct routing value. Fixes: 9f3a0f34b84a ("irqchip: Add support for Realtek RTL838x/RTL839x interrupt controller") Signed-off-by: Sander Vanheule Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/177b920aa8d8610615692d0e657e509f363c85ca.1641739718.git.sander@svanheule.net --- drivers/irqchip/irq-realtek-rtl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index d6788dd93c7b..568614edd88f 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -95,7 +95,8 @@ out: * SoC interrupts are cascaded to MIPS CPU interrupts according to the * interrupt-map in the device tree. Each SoC interrupt gets 4 bits for * the CPU interrupt in an Interrupt Routing Register. Max 32 SoC interrupts - * thus go into 4 IRRs. + * thus go into 4 IRRs. A routing value of '0' means the interrupt is left + * disconnected. Routing values {1..15} connect to output lines {0..14}. */ static int __init map_interrupts(struct device_node *node, struct irq_domain *domain) { @@ -134,7 +135,7 @@ static int __init map_interrupts(struct device_node *node, struct irq_domain *do of_node_put(cpu_ictl); cpu_int = be32_to_cpup(imap + 2); - if (cpu_int > 7) + if (cpu_int > 7 || cpu_int < 2) return -EINVAL; if (!(mips_irqs_set & BIT(cpu_int))) { @@ -143,7 +144,8 @@ static int __init map_interrupts(struct device_node *node, struct irq_domain *do mips_irqs_set |= BIT(cpu_int); } - regs[(soc_int * 4) / 32] |= cpu_int << (soc_int * 4) % 32; + /* Use routing values (1..6) for CPU interrupts (2..7) */ + regs[(soc_int * 4) / 32] |= (cpu_int - 1) << (soc_int * 4) % 32; imap += 3; } From 960dd884ddf5621ae6284cd3a42724500a97ae4c Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Sun, 9 Jan 2022 15:54:34 +0100 Subject: [PATCH 0111/1295] irqchip/realtek-rtl: Service all pending interrupts Instead of only servicing the lowest pending interrupt line, make sure all pending SoC interrupts are serviced before exiting the chained handler. This adds a small overhead if only one interrupt is pending, but should prevent rapid re-triggering of the handler. Signed-off-by: Sander Vanheule Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/5082ad3cb8b4eedf55075561b93eff6570299fe1.1641739718.git.sander@svanheule.net --- drivers/irqchip/irq-realtek-rtl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index 568614edd88f..50a56820c99b 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -76,16 +76,20 @@ static void realtek_irq_dispatch(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_domain *domain; - unsigned int pending; + unsigned long pending; + unsigned int soc_int; chained_irq_enter(chip, desc); pending = readl(REG(RTL_ICTL_GIMR)) & readl(REG(RTL_ICTL_GISR)); + if (unlikely(!pending)) { spurious_interrupt(); goto out; } + domain = irq_desc_get_handler_data(desc); - generic_handle_domain_irq(domain, __ffs(pending)); + for_each_set_bit(soc_int, &pending, 32) + generic_handle_domain_irq(domain, soc_int); out: chained_irq_exit(chip, desc); From c831d92890e037aafee662e66172d406804e4818 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Dec 2021 15:46:21 +0100 Subject: [PATCH 0112/1295] irqchip/loongson-pch-ms: Use bitmap_free() to free bitmap kfree() and bitmap_free() are the same. But using the latter is more consistent when freeing memory allocated with bitmap_zalloc(). Signed-off-by: Christophe JAILLET Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/0b982ab54844803049c217b2899baa59602faacd.1640529916.git.christophe.jaillet@wanadoo.fr --- drivers/irqchip/irq-loongson-pch-msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c index 32562b7e681b..e3801c4a77ed 100644 --- a/drivers/irqchip/irq-loongson-pch-msi.c +++ b/drivers/irqchip/irq-loongson-pch-msi.c @@ -241,7 +241,7 @@ static int pch_msi_init(struct device_node *node, return 0; err_map: - kfree(priv->msi_map); + bitmap_free(priv->msi_map); err_priv: kfree(priv); return ret; From a21864486f7e220bd5938c6fb637613d9635739a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 5 Jan 2022 13:15:09 +0800 Subject: [PATCH 0113/1295] KVM: x86/pmu: Fix available_event_types check for REF_CPU_CYCLES event According to CPUID 0x0A.EBX bit vector, the event [7] should be the unrealized event "Topdown Slots" instead of the *kernel* generalized common hardware event "REF_CPU_CYCLES", so we need to skip the cpuid unavaliblity check in the intel_pmc_perf_hw_id() for the last REF_CPU_CYCLES event and update the confusing comment. If the event is marked as unavailable in the Intel guest CPUID 0AH.EBX leaf, we need to avoid any perf_event creation, whether it's a gp or fixed counter. To distinguish whether it is a rejected event or an event that needs to be programmed with PERF_TYPE_RAW type, a new special returned value of "PERF_COUNT_HW_MAX + 1" is introduced. Fixes: 62079d8a43128 ("KVM: PMU: add proper support for fixed counter 2") Signed-off-by: Like Xu Message-Id: <20220105051509.69437-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8abdadb7e22a..e632693a2266 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -109,6 +109,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; + if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) + return; + attr.sample_period = get_sample_period(pmc, pmc->counter); if (in_tx) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5e0ac57d6d1b..ffccfd9823c0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -21,7 +21,6 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, @@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; @@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select && - intel_arch_events[i].unit_mask == unit_mask && - (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) - break; + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; + + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return PERF_COUNT_HW_MAX + 1; + + break; + } if (i == ARRAY_SIZE(intel_arch_events)) return PERF_COUNT_HW_MAX; From ee3a5f9e3d9bf94159f3cc80da542fbe83502dd8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:39 +0100 Subject: [PATCH 0114/1295] KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries kvm_update_cpuid_runtime() mangles CPUID data coming from userspace VMM after updating 'vcpu->arch.cpuid_entries', this makes it impossible to compare an update with what was previously supplied. Introduce __kvm_update_cpuid_runtime() version which can be used to tweak the input before it goes to 'vcpu->arch.cpuid_entries' so the upcoming update check can compare tweaked data. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c55e57b30e81..812190a707f6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -145,14 +145,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, int nent) { u32 base = vcpu->arch.kvm_cpuid_base; if (!base) return NULL; - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); +} + +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -167,11 +174,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -182,33 +190,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - best = kvm_find_cpuid_entry(vcpu, 7, 0); + best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_kvm_cpuid_features(vcpu); + best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, 0); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } } + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); +} EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -298,6 +311,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, { int r; + __kvm_update_cpuid_runtime(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; @@ -307,7 +322,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); return 0; From c6617c61e8fe44b9e9fdfede921f61cac6b5149d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:40 +0100 Subject: [PATCH 0115/1295] KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN Commit feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") forbade changing CPUID altogether but unfortunately this is not fully compatible with existing VMMs. In particular, QEMU reuses vCPU fds for CPU hotplug after unplug and it calls KVM_SET_CPUID2. Instead of full ban, check whether the supplied CPUID data is equal to what was previously set. Reported-by: Igor Mammedov Fixes: feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-3-vkuznets@redhat.com> Cc: stable@vger.kernel.org [Do not call kvm_find_cpuid_entry repeatedly. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 19 ------------------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 812190a707f6..7eb046d907c6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } +/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ +static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + struct kvm_cpuid_entry2 *orig; + int i; + + if (nent != vcpu->arch.cpuid_nent) + return -EINVAL; + + for (i = 0; i < nent; i++) { + orig = &vcpu->arch.cpuid_entries[i]; + if (e2[i].function != orig->function || + e2[i].index != orig->index || + e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || + e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) + return -EINVAL; + } + + return 0; +} + static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { u32 function; @@ -313,6 +335,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, __kvm_update_cpuid_runtime(vcpu, e2, nent); + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly. It would've been better to forbid any + * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately + * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM_SET_CPUID{,2} again. To support this legacy behavior, check + * whether the supplied CPUID data is equal to what's already set. + */ + if (vcpu->arch.last_vmentry_cpu != -1) + return kvm_cpuid_check_equal(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60da2331ec32..a0bc637348b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5230,17 +5230,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; - /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly, so fail. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5251,14 +5240,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; - /* - * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in - * KVM_SET_CPUID case above. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; From 9e6d484f9991176269607bb3c54a494e32eab27a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:41 +0100 Subject: [PATCH 0116/1295] KVM: selftests: Rename 'get_cpuid_test' to 'cpuid_test' In preparation to reusing the existing 'get_cpuid_test' for testing "KVM_SET_CPUID{,2} after KVM_RUN" rename it to 'cpuid_test' to avoid the confusion. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 2 +- tools/testing/selftests/kvm/Makefile | 4 ++-- .../selftests/kvm/x86_64/{get_cpuid_test.c => cpuid_test.c} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename tools/testing/selftests/kvm/x86_64/{get_cpuid_test.c => cpuid_test.c} (100%) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 8c129961accf..20b4c921c9a2 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,11 +8,11 @@ /s390x/memop /s390x/resets /s390x/sync_regs_test +/x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test /x86_64/emulator_error_test -/x86_64/get_cpuid_test /x86_64/get_msr_index_features /x86_64/kvm_clock_test /x86_64/kvm_pv_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ee8cf2149824..ec78a8692899 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c -TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test -TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c similarity index 100% rename from tools/testing/selftests/kvm/x86_64/get_cpuid_test.c rename to tools/testing/selftests/kvm/x86_64/cpuid_test.c From ecebb966acaab2466d9857d1cc435ee1fc9eee50 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:42 +0100 Subject: [PATCH 0117/1295] KVM: selftests: Test KVM_SET_CPUID2 after KVM_RUN KVM forbids KVM_SET_CPUID2 after KVM_RUN was performed on a vCPU unless the supplied CPUID data is equal to what was previously set. Test this. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 7 ++++ .../selftests/kvm/lib/x86_64/processor.c | 33 ++++++++++++++++--- .../testing/selftests/kvm/x86_64/cpuid_test.c | 30 +++++++++++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e94ba0fc67d8..bb013d101c14 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -375,6 +375,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid); void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid); @@ -418,6 +420,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, uint64_t pte); +/* + * get_cpuid() - find matching CPUID entry and return pointer to it. + */ +struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, + uint32_t index); /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. returns true diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index babb0f28575c..d61e2326dc85 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) return entry; } + +int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); +} + /* * VM VCPU CPUID Set * @@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int rc; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); + rc = __vcpu_set_cpuid(vm, vcpuid, cpuid); TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", rc, errno); @@ -1384,6 +1392,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) } } +struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, + uint32_t index) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; + + if (cur->function == function && cur->index == index) + return cur; + } + + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); + + return NULL; +} + bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent) { diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index a711f83749ea..16d2465c5634 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct return guest_cpuids; } +static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) +{ + struct kvm_cpuid_entry2 *ent; + int rc; + u32 eax, ebx, x; + + /* Setting unmodified CPUID is allowed */ + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + + /* Changing CPU features is forbidden */ + ent = get_cpuid(cpuid, 0x7, 0); + ebx = ent->ebx; + ent->ebx--; + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(rc, "Changing CPU features should fail"); + ent->ebx = ebx; + + /* Changing MAXPHYADDR is forbidden */ + ent = get_cpuid(cpuid, 0x80000008, 0); + eax = ent->eax; + x = eax & 0xff; + ent->eax = (eax & ~0xffu) | (x - 1); + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); + ent->eax = eax; +} + int main(void) { struct kvm_cpuid2 *supp_cpuid, *cpuid2; @@ -175,5 +203,7 @@ int main(void) for (stage = 0; stage < 3; stage++) run_vcpu(vm, VCPU_ID, stage); + set_cpuid_after_run(vm, cpuid2); + kvm_vm_free(vm); } From 4732f2444acd9b7eabc744f20eb5cc9694c0bfbd Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 11 Jan 2022 15:38:23 +0800 Subject: [PATCH 0118/1295] KVM: x86: Making the module parameter of vPMU more common The new module parameter to control PMU virtualization should apply to Intel as well as AMD, for situations where userspace is not trusted. If the module parameter allows PMU virtualization, there could be a new KVM_CAP or guest CPUID bits whereby userspace can enable/disable PMU virtualization on a per-VM basis. If the module parameter does not allow PMU virtualization, there should be no userspace override, since we have no precedent for authorizing that kind of override. If it's false, other counter-based profiling features (such as LBR including the associated CPUID bits if any) will not be exposed. Change its name from "pmu" to "enable_pmu" as we have temporary variables with the same name in our code like "struct kvm_pmu *pmu". Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization") Suggested-by : Jim Mattson Signed-off-by: Like Xu Message-Id: <20220111073823.21885-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++------ arch/x86/kvm/svm/svm.h | 1 - arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/x86.c | 5 +++++ arch/x86/kvm/x86.h | 1 + 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7eb046d907c6..a7c280d2113b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -845,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) perf_get_x86_pmu_capability(&cap); /* - * Only support guest architectural pmu on a host - * with architectural pmu. + * The guest architecture pmu is only supported if the architecture + * pmu exists on the host and the module parameters allow it. */ - if (!cap.version) + if (!cap.version || !enable_pmu) memset(&cap, 0, sizeof(cap)); eax.split.version_id = min(cap.version, 2); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 12d8b301065a..5aa45f13b16d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!pmu) + if (!enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c3d9006478a4..d8cac84fb2dc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -192,10 +192,6 @@ module_param(vgif, int, 0444); static int lbrv = true; module_param(lbrv, int, 0444); -/* enable/disable PMU virtualization */ -bool pmu = true; -module_param(pmu, bool, 0444); - static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); @@ -957,7 +953,7 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); /* AMD PMU PERFCTR_CORE CPUID */ - if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); /* CPUID 0x8000001F (SME/SEV features) */ @@ -1093,7 +1089,7 @@ static __init int svm_hardware_setup(void) pr_info("LBR virtualization supported\n"); } - if (!pmu) + if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); svm_set_cpu_caps(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9f153c59f2c8..6d29421dd4bf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,7 +32,6 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; -extern bool pmu; /* * Clean bits in VMCB. diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c8029b7845b6..959b59d13b5a 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ffccfd9823c0..466d18fc0c5d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -487,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0bc637348b2..52df8e6eaa57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index da7031e80f23..1ebd5a7594da 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -336,6 +336,7 @@ extern u64 host_xcr0; extern u64 supported_xcr0; extern u64 host_xss; extern u64 supported_xss; +extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { From e3548aaf41a200c2af359462be23bcdd76efd795 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 17 Jan 2022 00:20:47 -0600 Subject: [PATCH 0119/1295] cifs: free ntlmsspblob allocated in negotiate One of my previous fixes: cifs: send workstation name during ntlmssp session setup ...changed the prototype of build_ntlmssp_negotiate_blob from being allocated by the caller to being allocated within the function. The caller needs to free this object too. While SMB2 version of the caller did it, I forgot to free for the SMB1 version. Fixing that here. Fixes: 49bd49f983b5 ("cifs: send workstation name during ntlmssp session setup") Cc: stable@vger.kernel.org # 5.16 Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/sess.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d12490e12be5..ffaa091e41a4 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1413,7 +1413,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) &blob_len, ses, server, sess_data->nls_cp); if (rc) - goto out; + goto out_free_ntlmsspblob; sess_data->iov[1].iov_len = blob_len; sess_data->iov[1].iov_base = ntlmsspblob; @@ -1421,7 +1421,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = _sess_auth_rawntlmssp_assemble_req(sess_data); if (rc) - goto out; + goto out_free_ntlmsspblob; rc = sess_sendreceive(sess_data); @@ -1435,14 +1435,14 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = 0; if (rc) - goto out; + goto out_free_ntlmsspblob; cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto out; + goto out_free_ntlmsspblob; } ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ @@ -1456,10 +1456,13 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) cifs_dbg(VFS, "bad security blob length %d\n", blob_len); rc = -EINVAL; - goto out; + goto out_free_ntlmsspblob; } rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + +out_free_ntlmsspblob: + kfree(ntlmsspblob); out: sess_free_buffer(sess_data); From 74ce6135ae6ef482715cff2ccd703b7295f900f2 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 17 Jan 2022 09:11:30 +0800 Subject: [PATCH 0120/1295] cifs: clean up an inconsistent indenting Eliminate the follow smatch warning: fs/cifs/sess.c:1581 sess_auth_rawntlmssp_authenticate() warn: inconsistent indenting Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ffaa091e41a4..50ca479d7039 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1577,7 +1577,7 @@ out_free_ntlmsspblob: out: sess_free_buffer(sess_data); - if (!rc) + if (!rc) rc = sess_establish_session(sess_data); /* Cleanup */ From 5f51c7ce1dc36565296b3ef342585f70ec72a2a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2022 19:26:32 +0100 Subject: [PATCH 0121/1295] ACPI: CPPC: Fix up I/O port access in cpc_read() The code as currently implemented does not work on big endian systems, so fix it up. Fixes: a2c8f92bea5f ("ACPI: CPPC: Implement support for SystemIO registers") Reported-by: Dan Carpenter Suggested-by: Dan Carpenter Link: https://lore.kernel.org/linux-acpi/20220111092928.GA24968@kili/ Signed-off-by: Rafael J. Wysocki Acked-by: Huang Rui --- drivers/acpi/cppc_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index a9d2de403c0c..5bc55322e425 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -929,16 +929,18 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { u32 width = 8 << (reg->access_width - 1); + u32 val_u32; acpi_status status; status = acpi_os_read_port((acpi_io_address)reg->address, - (u32 *)val, width); + &val_u32, width); if (ACPI_FAILURE(status)) { pr_debug("Error: Failed to read SystemIO port %llx\n", reg->address); return -EFAULT; } + *val = val_u32; return 0; } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id); From f684b10751287e8b7e15e8f10ae40bdd74149bba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2022 19:27:22 +0100 Subject: [PATCH 0122/1295] ACPI: CPPC: Drop redundant local variable from cpc_read() The ret_val local variable in cpc_read() is not necessary, so eliminate it. No functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Huang Rui --- drivers/acpi/cppc_acpi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 5bc55322e425..866560cbb082 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -915,14 +915,13 @@ int __weak cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) { - int ret_val = 0; void __iomem *vaddr = NULL; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cpc_reg *reg = ®_res->cpc_entry.reg; if (reg_res->type == ACPI_TYPE_INTEGER) { *val = reg_res->cpc_entry.int_value; - return ret_val; + return 0; } *val = 0; @@ -968,10 +967,10 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) default: pr_debug("Error: Cannot read %u bit width from PCC for ss: %d\n", reg->bit_width, pcc_ss_id); - ret_val = -EFAULT; + return -EFAULT; } - return ret_val; + return 0; } static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val) From a510c78e5b6f2f9b9add88071fdfc2b740d2e356 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:33 -0800 Subject: [PATCH 0123/1295] ACPI: DPTF: Support Raptor Lake Add Raptor Lake ACPI IDs for DPTF devices. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/acpi/dptf/dptf_pch_fivr.c | 1 + drivers/acpi/dptf/dptf_power.c | 2 ++ drivers/acpi/dptf/int340x_thermal.c | 6 ++++++ drivers/acpi/fan.h | 1 + 4 files changed, 10 insertions(+) diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index e7ab0fc90db9..c0da24c9f8c3 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -151,6 +151,7 @@ static int pch_fivr_remove(struct platform_device *pdev) static const struct acpi_device_id pch_fivr_device_ids[] = { {"INTC1045", 0}, {"INTC1049", 0}, + {"INTC10A3", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, pch_fivr_device_ids); diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index a24d5d7aa117..dc1f52a5b3f4 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -231,6 +231,8 @@ static const struct acpi_device_id int3407_device_ids[] = { {"INTC1050", 0}, {"INTC1060", 0}, {"INTC1061", 0}, + {"INTC10A4", 0}, + {"INTC10A5", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3407_device_ids); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index da5d5f0be2f2..42a556346548 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -37,6 +37,12 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { {"INTC1050"}, {"INTC1060"}, {"INTC1061"}, + {"INTC10A0"}, + {"INTC10A1"}, + {"INTC10A2"}, + {"INTC10A3"}, + {"INTC10A4"}, + {"INTC10A5"}, {""}, }; diff --git a/drivers/acpi/fan.h b/drivers/acpi/fan.h index dc9a6efa514b..dd9bb8ca2244 100644 --- a/drivers/acpi/fan.h +++ b/drivers/acpi/fan.h @@ -10,4 +10,5 @@ {"INT3404", }, /* Fan */ \ {"INTC1044", }, /* Fan for Tiger Lake generation */ \ {"INTC1048", }, /* Fan for Alder Lake generation */ \ + {"INTC10A2", }, /* Fan for Raptor Lake generation */ \ {"PNP0C0B", } /* Generic ACPI fan */ From a95be874d26bed7c12a87b5493d5dac9281f707f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:34 -0800 Subject: [PATCH 0124/1295] thermal: int340x: Support Raptor Lake Add Raptor Lake ACPI IDs for DPTF devices. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 1 + drivers/thermal/intel/int340x_thermal/int3403_thermal.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 8502b7d8df89..72acb1f61849 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -596,6 +596,7 @@ static const struct acpi_device_id int3400_thermal_match[] = { {"INT3400", 0}, {"INTC1040", 0}, {"INTC1041", 0}, + {"INTC10A0", 0}, {} }; diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c index c3c4c4d34542..07e25321dfe3 100644 --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c @@ -285,6 +285,7 @@ static const struct acpi_device_id int3403_device_ids[] = { {"INT3403", 0}, {"INTC1043", 0}, {"INTC1046", 0}, + {"INTC10A1", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3403_device_ids); From e5b54867f47f765fcb439e09ed763b5de617af3e Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:35 -0800 Subject: [PATCH 0125/1295] thermal: int340x: Add Raptor Lake PCI device id Add Raptor Lake PCI ID for processor thermal device. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/processor_thermal_device.h | 1 + .../thermal/intel/int340x_thermal/processor_thermal_device_pci.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h index 9b2a64ef55d0..49932a68abac 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h @@ -24,6 +24,7 @@ #define PCI_DEVICE_ID_INTEL_HSB_THERMAL 0x0A03 #define PCI_DEVICE_ID_INTEL_ICL_THERMAL 0x8a03 #define PCI_DEVICE_ID_INTEL_JSL_THERMAL 0x4E03 +#define PCI_DEVICE_ID_INTEL_RPL_THERMAL 0xA71D #define PCI_DEVICE_ID_INTEL_SKL_THERMAL 0x1903 #define PCI_DEVICE_ID_INTEL_TGL_THERMAL 0x9A03 diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index b4bcd3fe9eb2..ca40b0967cdd 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -358,6 +358,7 @@ static SIMPLE_DEV_PM_OPS(proc_thermal_pci_pm, proc_thermal_pci_suspend, static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, ADL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_MBOX) }, + { PCI_DEVICE_DATA(INTEL, RPL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_MBOX) }, { }, }; From 7eacba3b00a3c35c1ad189f543b1995dd0bdca9c Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Fri, 14 Jan 2022 22:53:00 +0300 Subject: [PATCH 0126/1295] cifs: alloc_path_with_tree_prefix: do not append sep. if the path is empty alloc_path_with_tree_prefix() concatenates tree prefix and the path. Windows CIFS client does not add separator after the tree prefix if the path is empty. Let's do the same. This fixes mounting DFS namespaces with names containing non-ASCII symbols. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215440 Signed-off-by: Eugene Korenevsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8d471df69c59..625e3e9cb614 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2587,8 +2587,13 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, cp = load_nls_default(); cifs_strtoUTF16(*out_path, treename, treename_len, cp); - UniStrcat(*out_path, sep); - UniStrcat(*out_path, path); + + /* Do not append the separator if the path is empty */ + if (path[0] != cpu_to_le16(0x0000)) { + UniStrcat(*out_path, sep); + UniStrcat(*out_path, path); + } + unload_nls(cp); return 0; From a2809d0e16963fdf3984409e47f145cccb0c6821 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Fri, 14 Jan 2022 22:53:40 +0300 Subject: [PATCH 0127/1295] cifs: quirk for STATUS_OBJECT_NAME_INVALID returned for non-ASCII dfs refs Windows SMB server responds with STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request for "\\\" DFS reference, where contains non-ASCII unicode symbols. Check such DFS reference and emulate -EREMOTE if it is actual. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215440 Signed-off-by: Eugene Korenevsky Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 5 +++++ fs/cifs/connect.c | 5 +++++ fs/cifs/inode.c | 6 ++++++ fs/cifs/misc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e0dc147e69a8..f2029bc46215 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -647,6 +647,11 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, int match_target_ip(struct TCP_Server_Info *server, const char *share, size_t share_len, bool *result); + +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *dfs_link_path); #endif static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0f36deff790e..accce1b351c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3374,6 +3374,11 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, full_path); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, + full_path); +#endif if (rc != 0 && rc != -EREMOTE) { kfree(full_path); return rc; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 279622e4eb1c..baa197edd8c5 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -952,6 +952,12 @@ cifs_get_inode_info(struct inode **inode, rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, tmp_data, &adjust_tz, &is_reparse_point); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, + cifs_sb, + full_path); +#endif data = tmp_data; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 5148d48d6a35..56598f7dbe00 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1302,4 +1302,53 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; return 0; } + +/** cifs_dfs_query_info_nonascii_quirk + * Handle weird Windows SMB server behaviour. It responds with + * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request + * for "\\\" DFS reference, + * where contains non-ASCII unicode symbols. + * + * Check such DFS reference and emulate -ENOENT if it is actual. + */ +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *linkpath) +{ + char *treename, *dfspath, sep; + int treenamelen, linkpathlen, rc; + + treename = tcon->treeName; + /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL + * messages MUST be encoded with exactly one leading backslash, not two + * leading backslashes. + */ + sep = CIFS_DIR_SEP(cifs_sb); + if (treename[0] == sep && treename[1] == sep) + treename++; + linkpathlen = strlen(linkpath); + treenamelen = strnlen(treename, MAX_TREE_SIZE + 1); + dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL); + if (!dfspath) + return -ENOMEM; + if (treenamelen) + memcpy(dfspath, treename, treenamelen); + memcpy(dfspath + treenamelen, linkpath, linkpathlen); + rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), dfspath, NULL, NULL); + if (rc == 0) { + cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n", + dfspath); + rc = -EREMOTE; + } else if (rc == -EEXIST) { + cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n", + dfspath); + rc = -ENOENT; + } else { + cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc); + } + kfree(dfspath); + return rc; +} #endif From 6b0764598dc7dea21a44cd4e7ec3dd4a7aabf5c2 Mon Sep 17 00:00:00 2001 From: Wang Cheng Date: Sat, 15 Jan 2022 20:52:36 +0800 Subject: [PATCH 0128/1295] docs: staging/tee.rst: fix two typos found while reading Signed-off-by: Wang Cheng Reviewed-by: Sumit Garg Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20220115125236.34886-1-wanngchenng@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/staging/tee.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/staging/tee.rst b/Documentation/staging/tee.rst index 4d4b5f889603..4eec213eaefe 100644 --- a/Documentation/staging/tee.rst +++ b/Documentation/staging/tee.rst @@ -225,7 +225,7 @@ The following picture shows a high level overview of AMD-TEE:: +--------------------------+ +---------+--------------------+ At the lowest level (in x86), the AMD Secure Processor (ASP) driver uses the -CPU to PSP mailbox regsister to submit commands to the PSP. The format of the +CPU to PSP mailbox register to submit commands to the PSP. The format of the command buffer is opaque to the ASP driver. It's role is to submit commands to the secure processor and return results to AMD-TEE driver. The interface between AMD-TEE driver and AMD Secure Processor driver can be found in [6]. @@ -260,7 +260,7 @@ cancel_req driver callback is not supported by AMD-TEE. The GlobalPlatform TEE Client API [5] can be used by the user space (client) to talk to AMD's TEE. AMD's TEE provides a secure environment for loading, opening -a session, invoking commands and clossing session with TA. +a session, invoking commands and closing session with TA. References ========== From cc2cf6796a90bf356a8ddc854f65cea434477ea7 Mon Sep 17 00:00:00 2001 From: Huichun Feng Date: Tue, 11 Jan 2022 13:20:01 +0800 Subject: [PATCH 0129/1295] docs: ftrace: fix ambiguous sentence The sentence looks ambiguous, rephrase it by adding ", there". Signed-off-by: Huichun Feng Signed-off-by: Ching-Chun (Jim) Huang Signed-off-by: Chun-Hung Tseng Acked-by: Steven Rostedt Link: https://lore.kernel.org/r/20220111052000.2675944-1-foxhoundsk.tw@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/trace/ftrace.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index b3166c4a7867..45b8c56af67a 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -3370,7 +3370,7 @@ one of the latency tracers, you will get the following results. Instances --------- -In the tracefs tracing directory is a directory called "instances". +In the tracefs tracing directory, there is a directory called "instances". This directory can have new directories created inside of it using mkdir, and removing directories with rmdir. The directory created with mkdir in this directory will already contain files and other From 2ba144e68edb4987a2fe1b1cf418b58cbdc4ee96 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Mon, 10 Jan 2022 22:44:56 +0100 Subject: [PATCH 0130/1295] dt-bindings: power: reset: gpio-restart: Correct default priority Commit bcd56fe1aa97 ("power: reset: gpio-restart: increase priority slightly") changed the default restart priority 129, but did not update the documentation. Correct this, so the driver and documentation have the same default value. Signed-off-by: Sander Vanheule Reviewed-by: Rob Herring Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220110214456.67087-1-sander@svanheule.net --- .../devicetree/bindings/power/reset/gpio-restart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml b/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml index 3dd22220cb5f..a72d5c721516 100644 --- a/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml +++ b/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml @@ -43,7 +43,7 @@ properties: priority: $ref: /schemas/types.yaml#/definitions/uint32 description: | - A priority ranging from 0 to 255 (default 128) according to the following guidelines: + A priority ranging from 0 to 255 (default 129) according to the following guidelines: 0: Restart handler of last resort, with limited restart capabilities. 128: Default restart handler; use if no other restart handler is expected to be available, @@ -51,7 +51,7 @@ properties: 255: Highest priority restart handler, will preempt all other restart handlers. minimum: 0 maximum: 255 - default: 128 + default: 129 active-delay: $ref: /schemas/types.yaml#/definitions/uint32 From 38a9840e2e396e86e3a39d1d2daf5f46204066c7 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:28:42 +0100 Subject: [PATCH 0131/1295] dt-bindings: vendor-prefixes: add 8devices The vendor prefix for 8devices [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.8devices.com/ [2] arch/arm/boot/dts/qcom-ipq4018-jalapeno.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113102842.GA4357@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 5983a2f6fb30..724a65146993 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -25,6 +25,8 @@ patternProperties: # Keep list in alphabetical order. "^70mai,.*": description: 70mai Co., Ltd. + "^8dev,.*": + description: 8devices, UAB "^abb,.*": description: ABB "^abilis,.*": From 8316cbbafd8bdf523fb1b3230f0701beae51679b Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:29:26 +0100 Subject: [PATCH 0132/1295] dt-bindings: vendor-prefixes: add F(x)tec The vendor prefix for F(x)tec [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.fxtec.com/ [2] arch/arm64/boot/dts/qcom/msm8998-fxtec-pro1.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113102926.GA4388@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 724a65146993..c14a084c90a6 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -439,6 +439,8 @@ patternProperties: description: Freescale Semiconductor "^fujitsu,.*": description: Fujitsu Ltd. + "^fxtec,.*": + description: FX Technology Ltd. "^gardena,.*": description: GARDENA GmbH "^gateworks,.*": From ca146834d6cdbde80dead9daf8e7e127fc85d31f Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:30:05 +0100 Subject: [PATCH 0133/1295] dt-bindings: vendor-prefixes: add Huawei The vendor prefix for Huawei [1] is used in device trees [2][3], but was not documented so far. Add it to the schema to document it. [1] https://www.huawei.com/en/ [2] arch/arm64/boot/dts/qcom/msm8916-huawei-g7.dts [3] arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103005.GA4421@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index c14a084c90a6..9de8c0f5217f 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -515,6 +515,8 @@ patternProperties: description: HannStar Display Co. "^holtek,.*": description: Holtek Semiconductor, Inc. + "^huawei,.*": + description: Huawei Technologies Co., Ltd. "^hugsun,.*": description: Shenzhen Hugsun Technology Co. Ltd. "^hwacom,.*": From 6f2dfed0b6f078149072c34631a77902a8c85c7e Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:30:36 +0100 Subject: [PATCH 0134/1295] dt-bindings: vendor-prefixes: add Thundercomm The vendor prefix for Thundercomm [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.thundercomm.com/ [2] arch/arm64/boot/dts/qcom/sdm845-db845c.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103036.GA4456@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 9de8c0f5217f..1634c4db79a9 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1207,6 +1207,8 @@ patternProperties: description: THine Electronics, Inc. "^thingyjp,.*": description: thingy.jp + "^thundercomm,.*": + description: Thundercomm Technology Co., Ltd. "^ti,.*": description: Texas Instruments "^tianma,.*": From 364da22cb30eb79198922ebf53c40f589bcece9d Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:31:10 +0100 Subject: [PATCH 0135/1295] dt-bindings: vendor-prefixes: add Wingtech The vendor prefix for Wingtech [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] http://www.wingtech.com/en [2] arch/arm64/boot/dts/qcom/msm8916-wingtech-wt88047.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103110.GA4488@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 1634c4db79a9..a2efbc26d945 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1332,6 +1332,8 @@ patternProperties: description: Wiligear, Ltd. "^winbond,.*": description: Winbond Electronics corp. + "^wingtech,.*": + description: Wingtech Technology Co., Ltd. "^winstar,.*": description: Winstar Display Corp. "^wits,.*": From 154e5f296e2af04fc2775a6de3c76e6ee37b5609 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 14:39:01 +0100 Subject: [PATCH 0136/1295] dt-bindings: trivial-devices: fix swapped comments sparkfun,qwiic-joystick and st,24c256 had their comments incorrectly swapped. Swap them to make them correct. Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/7dc6ddb0b042cd243b2875e9aea81cad541d1c6b.1642080090.git.stano.jakubek@gmail.com --- Documentation/devicetree/bindings/trivial-devices.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 72c9f8610766..ee5edc8ad153 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -295,9 +295,9 @@ properties: - skyworks,sky81452 # Socionext SynQuacer TPM MMIO module - socionext,synquacer-tpm-mmio - # i2c serial eeprom (24cxx) - - sparkfun,qwiic-joystick # SparkFun Qwiic Joystick (COM-15168) with i2c interface + - sparkfun,qwiic-joystick + # i2c serial eeprom (24cxx) - st,24c256 # Ambient Light Sensor with SMBUS/Two Wire Serial Interface - taos,tsl2550 From af35a8b5bab7c1e31d30b9ad78a981fae9bda903 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 14:39:20 +0100 Subject: [PATCH 0137/1295] dt-bindings: trivial-devices: fix double spaces in comments Cleanup double spaces in some of the comments. Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/13b3f66efd3b20f1d9bbb9eff1eca00757ac5367.1642080090.git.stano.jakubek@gmail.com --- Documentation/devicetree/bindings/trivial-devices.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index ee5edc8ad153..e285386f3de0 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -31,7 +31,7 @@ properties: - enum: # SMBus/I2C Digital Temperature Sensor in 6-Pin SOT with SMBus Alert and Over Temperature Pin - ad,ad7414 - # ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems + # ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems - ad,adm9240 # AD5110 - Nonvolatile Digital Potentiometer - adi,ad5110 @@ -43,7 +43,7 @@ properties: - adi,adp5589 # AMS iAQ-Core VOC Sensor - ams,iaq-core - # i2c serial eeprom (24cxx) + # i2c serial eeprom (24cxx) - at,24c08 # i2c trusted platform module (TPM) - atmel,at97sc3204t @@ -297,7 +297,7 @@ properties: - socionext,synquacer-tpm-mmio # SparkFun Qwiic Joystick (COM-15168) with i2c interface - sparkfun,qwiic-joystick - # i2c serial eeprom (24cxx) + # i2c serial eeprom (24cxx) - st,24c256 # Ambient Light Sensor with SMBUS/Two Wire Serial Interface - taos,tsl2550 From d8adf5b92a9d2205620874d498c39923ecea8749 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Thu, 13 Jan 2022 09:19:18 +0100 Subject: [PATCH 0138/1295] scripts/dtc: dtx_diff: remove broken example from help text dtx_diff suggests to use <(...) syntax to pipe two inputs into it, but this has never worked: The /proc/self/fds/... paths passed by the shell will fail the `[ -f "${dtx}" ] && [ -r "${dtx}" ]` check in compile_to_dts, but even with this check removed, the function cannot work: hexdump will eat up the DTB magic, making the subsequent dtc call fail, as a pipe cannot be rewound. Simply remove this broken example, as there is already an alternative one that works fine. Fixes: 10eadc253ddf ("dtc: create tool to diff device trees") Signed-off-by: Matthias Schiffer Reviewed-by: Frank Rowand Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113081918.10387-1-matthias.schiffer@ew.tq-group.com --- scripts/dtc/dtx_diff | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/dtc/dtx_diff b/scripts/dtc/dtx_diff index d3422ee15e30..f2bbde4bba86 100755 --- a/scripts/dtc/dtx_diff +++ b/scripts/dtc/dtx_diff @@ -59,12 +59,8 @@ Otherwise DTx is treated as a dts source file (aka .dts). or '/include/' to be processed. If DTx_1 and DTx_2 are in different architectures, then this script - may not work since \${ARCH} is part of the include path. Two possible - workarounds: - - `basename $0` \\ - <(ARCH=arch_of_dtx_1 `basename $0` DTx_1) \\ - <(ARCH=arch_of_dtx_2 `basename $0` DTx_2) + may not work since \${ARCH} is part of the include path. The following + workaround can be used: `basename $0` ARCH=arch_of_dtx_1 DTx_1 >tmp_dtx_1.dts `basename $0` ARCH=arch_of_dtx_2 DTx_2 >tmp_dtx_2.dts From b7fb2dad571d1e21173c06cef0bced77b323990a Mon Sep 17 00:00:00 2001 From: Sujit Kautkar Date: Mon, 10 Jan 2022 10:47:36 -0800 Subject: [PATCH 0139/1295] rpmsg: char: Fix race between the release of rpmsg_ctrldev and cdev struct rpmsg_ctrldev contains a struct cdev. The current code frees the rpmsg_ctrldev struct in rpmsg_ctrldev_release_device(), but the cdev is a managed object, therefore its release is not predictable and the rpmsg_ctrldev could be freed before the cdev is entirely released, as in the backtrace below. [ 93.625603] ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x7c [ 93.636115] WARNING: CPU: 0 PID: 12 at lib/debugobjects.c:488 debug_print_object+0x13c/0x1b0 [ 93.644799] Modules linked in: veth xt_cgroup xt_MASQUERADE rfcomm algif_hash algif_skcipher af_alg uinput ip6table_nat fuse uvcvideo videobuf2_vmalloc venus_enc venus_dec videobuf2_dma_contig hci_uart btandroid btqca snd_soc_rt5682_i2c bluetooth qcom_spmi_temp_alarm snd_soc_rt5682v [ 93.715175] CPU: 0 PID: 12 Comm: kworker/0:1 Tainted: G B 5.4.163-lockdep #26 [ 93.723855] Hardware name: Google Lazor (rev3 - 8) with LTE (DT) [ 93.730055] Workqueue: events kobject_delayed_cleanup [ 93.735271] pstate: 60c00009 (nZCv daif +PAN +UAO) [ 93.740216] pc : debug_print_object+0x13c/0x1b0 [ 93.744890] lr : debug_print_object+0x13c/0x1b0 [ 93.749555] sp : ffffffacf5bc7940 [ 93.752978] x29: ffffffacf5bc7940 x28: dfffffd000000000 [ 93.758448] x27: ffffffacdb11a800 x26: dfffffd000000000 [ 93.763916] x25: ffffffd0734f856c x24: dfffffd000000000 [ 93.769389] x23: 0000000000000000 x22: ffffffd0733c35b0 [ 93.774860] x21: ffffffd0751994a0 x20: ffffffd075ec27c0 [ 93.780338] x19: ffffffd075199100 x18: 00000000000276e0 [ 93.785814] x17: 0000000000000000 x16: dfffffd000000000 [ 93.791291] x15: ffffffffffffffff x14: 6e6968207473696c [ 93.796768] x13: 0000000000000000 x12: ffffffd075e2b000 [ 93.802244] x11: 0000000000000001 x10: 0000000000000000 [ 93.807723] x9 : d13400dff1921900 x8 : d13400dff1921900 [ 93.813200] x7 : 0000000000000000 x6 : 0000000000000000 [ 93.818676] x5 : 0000000000000080 x4 : 0000000000000000 [ 93.824152] x3 : ffffffd0732a0fa4 x2 : 0000000000000001 [ 93.829628] x1 : ffffffacf5bc7580 x0 : 0000000000000061 [ 93.835104] Call trace: [ 93.837644] debug_print_object+0x13c/0x1b0 [ 93.841963] __debug_check_no_obj_freed+0x25c/0x3c0 [ 93.846987] debug_check_no_obj_freed+0x18/0x20 [ 93.851669] slab_free_freelist_hook+0xbc/0x1e4 [ 93.856346] kfree+0xfc/0x2f4 [ 93.859416] rpmsg_ctrldev_release_device+0x78/0xb8 [ 93.864445] device_release+0x84/0x168 [ 93.868310] kobject_cleanup+0x12c/0x298 [ 93.872356] kobject_delayed_cleanup+0x10/0x18 [ 93.876948] process_one_work+0x578/0x92c [ 93.881086] worker_thread+0x804/0xcf8 [ 93.884963] kthread+0x2a8/0x314 [ 93.888303] ret_from_fork+0x10/0x18 The cdev_device_add/del() API was created to address this issue (see commit '233ed09d7fda ("chardev: add helper function to register char devs with a struct device")'), use it instead of cdev add/del(). Fixes: c0cdc19f84a4 ("rpmsg: Driver for user space endpoint interface") Signed-off-by: Sujit Kautkar Signed-off-by: Matthias Kaehlcke Reviewed-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220110104706.v6.1.Iaac908f3e3149a89190ce006ba166e2d3fd247a3@changeid --- drivers/rpmsg/rpmsg_char.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index d6214cb66026..8d3c2ca7081b 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -462,7 +462,6 @@ static void rpmsg_ctrldev_release_device(struct device *dev) ida_simple_remove(&rpmsg_ctrl_ida, dev->id); ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); - cdev_del(&ctrldev->cdev); kfree(ctrldev); } @@ -497,19 +496,13 @@ static int rpmsg_chrdev_probe(struct rpmsg_device *rpdev) dev->id = ret; dev_set_name(&ctrldev->dev, "rpmsg_ctrl%d", ret); - ret = cdev_add(&ctrldev->cdev, dev->devt, 1); + ret = cdev_device_add(&ctrldev->cdev, &ctrldev->dev); if (ret) goto free_ctrl_ida; /* We can now rely on the release function for cleanup */ dev->release = rpmsg_ctrldev_release_device; - ret = device_add(dev); - if (ret) { - dev_err(&rpdev->dev, "device_add failed: %d\n", ret); - put_device(dev); - } - dev_set_drvdata(&rpdev->dev, ctrldev); return ret; @@ -535,7 +528,7 @@ static void rpmsg_chrdev_remove(struct rpmsg_device *rpdev) if (ret) dev_warn(&rpdev->dev, "failed to nuke endpoints: %d\n", ret); - device_del(&ctrldev->dev); + cdev_device_del(&ctrldev->cdev, &ctrldev->dev); put_device(&ctrldev->dev); } From 7a534ae89e34e9b51acb5a63dd0f88308178b46a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 10 Jan 2022 10:47:37 -0800 Subject: [PATCH 0140/1295] rpmsg: char: Fix race between the release of rpmsg_eptdev and cdev struct rpmsg_eptdev contains a struct cdev. The current code frees the rpmsg_eptdev struct in rpmsg_eptdev_destroy(), but the cdev is a managed object, therefore its release is not predictable and the rpmsg_eptdev could be freed before the cdev is entirely released. The cdev_device_add/del() API was created to address this issue (see commit '233ed09d7fda ("chardev: add helper function to register char devs with a struct device")'), use it instead of cdev add/del(). Fixes: c0cdc19f84a4 ("rpmsg: Driver for user space endpoint interface") Suggested-by: Bjorn Andersson Signed-off-by: Matthias Kaehlcke Reviewed-by: Mathieu Poirier Reviewed-by: Stephen Boyd Reviewed-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220110104706.v6.2.Idde68b05b88d4a2e6e54766c653f3a6d9e419ce6@changeid --- drivers/rpmsg/rpmsg_char.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 8d3c2ca7081b..5663cf799c95 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -93,7 +93,7 @@ static int rpmsg_eptdev_destroy(struct device *dev, void *data) /* wake up any blocked readers */ wake_up_interruptible(&eptdev->readq); - device_del(&eptdev->dev); + cdev_device_del(&eptdev->cdev, &eptdev->dev); put_device(&eptdev->dev); return 0; @@ -336,7 +336,6 @@ static void rpmsg_eptdev_release_device(struct device *dev) ida_simple_remove(&rpmsg_ept_ida, dev->id); ida_simple_remove(&rpmsg_minor_ida, MINOR(eptdev->dev.devt)); - cdev_del(&eptdev->cdev); kfree(eptdev); } @@ -381,19 +380,13 @@ static int rpmsg_eptdev_create(struct rpmsg_ctrldev *ctrldev, dev->id = ret; dev_set_name(dev, "rpmsg%d", ret); - ret = cdev_add(&eptdev->cdev, dev->devt, 1); + ret = cdev_device_add(&eptdev->cdev, &eptdev->dev); if (ret) goto free_ept_ida; /* We can now rely on the release function for cleanup */ dev->release = rpmsg_eptdev_release_device; - ret = device_add(dev); - if (ret) { - dev_err(dev, "device_add failed: %d\n", ret); - put_device(dev); - } - return ret; free_ept_ida: From eee412e968f7b950564880bc6a7a9f00f49034da Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 17:13:38 -0800 Subject: [PATCH 0141/1295] remoteproc: qcom: q6v5: fix service routines build errors When CONFIG_QCOM_AOSS_QMP=m and CONFIG_QCOM_Q6V5_MSS=y, the builtin driver cannot call into the loadable module's low-level service functions. Trying to build with that config combo causes linker errors. There are two problems here. First, drivers/remoteproc/qcom_q6v5.c should #include for the definitions of the service functions, depending on whether CONFIG_QCOM_AOSS_QMP is set/enabled or not. Second, the qcom remoteproc drivers should depend on QCOM_AOSS_QMP iff it is enabled (=y or =m) so that the qcom remoteproc drivers can be built properly. This prevents these build errors: aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `q6v5_load_state_toggle': qcom_q6v5.c:(.text+0xc4): undefined reference to `qmp_send' aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `qcom_q6v5_deinit': (.text+0x2e4): undefined reference to `qmp_put' aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `qcom_q6v5_init': (.text+0x778): undefined reference to `qmp_get' aarch64-linux-ld: (.text+0x7d8): undefined reference to `qmp_put' Fixes: c1fe10d238c0 ("remoteproc: qcom: q6v5: Use qmp_send to update co-processor load state") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Bjorn Andersson Cc: Mathieu Poirier Cc: linux-remoteproc@vger.kernel.org Cc: Sibi Sankar Cc: Stephen Boyd Reviewed-by: Stephen Boyd Reviewed-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220115011338.2973-1-rdunlap@infradead.org --- drivers/remoteproc/Kconfig | 4 ++++ drivers/remoteproc/qcom_q6v5.c | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 3ddd426fc969..166019786653 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -180,6 +180,7 @@ config QCOM_Q6V5_ADSP depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_PIL_INFO select QCOM_MDT_LOADER @@ -199,6 +200,7 @@ config QCOM_Q6V5_MSS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_PIL_INFO @@ -218,6 +220,7 @@ config QCOM_Q6V5_PAS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_PIL_INFO select QCOM_MDT_LOADER @@ -239,6 +242,7 @@ config QCOM_Q6V5_WCSS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_PIL_INFO diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c index eada7e34f3af..442a388f8102 100644 --- a/drivers/remoteproc/qcom_q6v5.c +++ b/drivers/remoteproc/qcom_q6v5.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include From aee101d7b95a03078945681dd7f7ea5e4a1e7686 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 17 Jan 2022 23:44:03 +1000 Subject: [PATCH 0142/1295] powerpc/64s: Mask SRR0 before checking against the masked NIP Commit 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against SRR0") masked off the low 2 bits of the NIP value in the interrupt stack frame in case they are non-zero and mis-compare against a SRR0 register value of a CPU which always reads back 0 from the 2 low bits which are reserved. This now causes the opposite problem that an implementation which does implement those bits in SRR0 will mis-compare against the masked NIP value in which they have been cleared. QEMU is one such implementation, and this is allowed by the architecture. This can be triggered by sigfuz by setting low bits of PT_NIP in the signal context. Fix this for now by masking the SRR0 bits as well. Cleaner is probably to sanitise these values before putting them in registers or stack, but this is the quick and backportable fix. Fixes: 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against SRR0") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220117134403.2995059-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 92088f848266..7bab2d7de372 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -30,6 +30,7 @@ COMPAT_SYS_CALL_TABLE: .ifc \srr,srr mfspr r11,SPRN_SRR0 ld r12,_NIP(r1) + clrrdi r11,r11,2 clrrdi r12,r12,2 100: tdne r11,r12 EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) @@ -40,6 +41,7 @@ COMPAT_SYS_CALL_TABLE: .else mfspr r11,SPRN_HSRR0 ld r12,_NIP(r1) + clrrdi r11,r11,2 clrrdi r12,r12,2 100: tdne r11,r12 EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) From 5455b9ecaf231ec5c6b0cd5c6076eb64c9dbc9aa Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 18 Jan 2022 12:16:57 +1000 Subject: [PATCH 0143/1295] cifs: serialize all mount attempts RHBZ: 2008434 Some servers, such as Windows2016 have a very low number of concurrent mounts that they allow from each client. This can be a problem if you have a more than a handful (==3 in this case) of cifs entries in your fstab and cause a number of the mounts there to randomly fail. Add a global mutex and use it to serialize all mount attempts. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/fs_context.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index e3ed25dc6f3f..7ec35f3f0a5f 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -37,6 +37,8 @@ #include "rfc1002pdu.h" #include "fs_context.h" +static DEFINE_MUTEX(cifs_mount_mutex); + static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, @@ -707,10 +709,14 @@ static int smb3_get_tree_common(struct fs_context *fc) static int smb3_get_tree(struct fs_context *fc) { int err = smb3_fs_context_validate(fc); + int ret; if (err) return err; - return smb3_get_tree_common(fc); + mutex_lock(&cifs_mount_mutex); + ret = smb3_get_tree_common(fc); + mutex_unlock(&cifs_mount_mutex); + return ret; } static void smb3_fs_context_free(struct fs_context *fc) From 5f02ef741a785678930f3ff0a8b6b2b0ef1bb402 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 18 Jan 2022 04:34:43 -0500 Subject: [PATCH 0144/1295] KVM: VMX: switch blocked_vcpu_on_cpu_lock to raw spinlock blocked_vcpu_on_cpu_lock is taken from hard interrupt context (pi_wakeup_handler), therefore it cannot sleep. Switch it to a raw spinlock. Fixes: [41297.066254] BUG: scheduling while atomic: CPU 0/KVM/635218/0x00010001 [41297.066323] Preemption disabled at: [41297.066324] [] irq_enter_rcu+0xf/0x60 [41297.066339] Call Trace: [41297.066342] [41297.066346] dump_stack_lvl+0x34/0x44 [41297.066353] ? irq_enter_rcu+0xf/0x60 [41297.066356] __schedule_bug.cold+0x7d/0x8b [41297.066361] __schedule+0x439/0x5b0 [41297.066365] ? task_blocks_on_rt_mutex.constprop.0.isra.0+0x1b0/0x440 [41297.066369] schedule_rtlock+0x1e/0x40 [41297.066371] rtlock_slowlock_locked+0xf1/0x260 [41297.066374] rt_spin_lock+0x3b/0x60 [41297.066378] pi_wakeup_handler+0x31/0x90 [kvm_intel] [41297.066388] sysvec_kvm_posted_intr_wakeup_ipi+0x9d/0xd0 [41297.066392] [41297.066392] asm_sysvec_kvm_posted_intr_wakeup_ipi+0x12/0x20 ... Signed-off-by: Marcelo Tosatti Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1c94783b5a54..21ea58d25771 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -15,7 +15,7 @@ * can find which vCPU should be waken up. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -121,9 +121,9 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -154,11 +154,11 @@ int pi_pre_block(struct kvm_vcpu *vcpu) local_irq_disable(); if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(blocked_vcpu_on_cpu, vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); } do { @@ -215,7 +215,7 @@ void pi_wakeup_handler(void) struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), blocked_vcpu_list) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -223,13 +223,13 @@ void pi_wakeup_handler(void) if (pi_test_on(pi_desc) == 1) kvm_vcpu_kick(vcpu); } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) From 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Dec 2021 13:19:52 +0100 Subject: [PATCH 0145/1295] perf: Fix perf_event_read_local() time Time readers that cannot take locks (due to NMI etc..) currently make use of perf_event::shadow_ctx_time, which, for that event gives: time' = now + (time - timestamp) or, alternatively arranged: time' = time + (now - timestamp) IOW, the progression of time since the last time the shadow_ctx_time was updated. There's problems with this: A) the shadow_ctx_time is per-event, even though the ctx_time it reflects is obviously per context. The direct concequence of this is that the context needs to iterate all events all the time to keep the shadow_ctx_time in sync. B) even with the prior point, the context itself might not be active meaning its time should not advance to begin with. C) shadow_ctx_time isn't consistently updated when ctx_time is There are 3 users of this stuff, that suffer differently from this: - calc_timer_values() - perf_output_read() - perf_event_update_userpage() /* A */ - perf_event_read_local() /* A,B */ In particular, perf_output_read() doesn't suffer at all, because it's sample driven and hence only relevant when the event is actually running. This same was supposed to be true for perf_event_update_userpage(), after all self-monitoring implies the context is active *HOWEVER*, as per commit f79256532682 ("perf/core: fix userpage->time_enabled of inactive events") this goes wrong when combined with counter overcommit, in that case those events that do not get scheduled when the context becomes active (task events typically) miss out on the EVENT_TIME update and ENABLED time is inflated (for a little while) with the time the context was inactive. Once the event gets rotated in, this gets corrected, leading to a non-monotonic timeflow. perf_event_read_local() made things even worse, it can request time at any point, suffering all the problems perf_event_update_userpage() does and more. Because while perf_event_update_userpage() is limited by the context being active, perf_event_read_local() users have no such constraint. Therefore, completely overhaul things and do away with perf_event::shadow_ctx_time. Instead have regular context time updates keep track of this offset directly and provide perf_event_time_now() to complement perf_event_time(). perf_event_time_now() will, in adition to being context wide, also take into account if the context is active. For inactive context, it will not advance time. This latter property means the cgroup perf_cgroup_info context needs to grow addition state to track this. Additionally, since all this is strictly per-cpu, we can use barrier() to order context activity vs context time. Fixes: 7d9285e82db5 ("perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers"") Signed-off-by: Peter Zijlstra (Intel) Tested-by: Song Liu Tested-by: Namhyung Kim Link: https://lkml.kernel.org/r/YcB06DasOBtU0b00@hirez.programming.kicks-ass.net --- include/linux/perf_event.h | 15 +-- kernel/events/core.c | 250 ++++++++++++++++++++++--------------- 2 files changed, 151 insertions(+), 114 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 117f230bcdfd..733649184b27 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -693,18 +693,6 @@ struct perf_event { u64 total_time_running; u64 tstamp; - /* - * timestamp shadows the actual context timing but it can - * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in, - * or when ctx_sched_in failed to schedule the event because we - * run out of PMC. - * - * ctx_time already accounts for ctx->timestamp. Therefore to - * compute ctx_time for a sample, simply add perf_clock(). - */ - u64 shadow_ctx_time; - struct perf_event_attr attr; u16 header_size; u16 id_header_size; @@ -852,6 +840,7 @@ struct perf_event_context { */ u64 time; u64 timestamp; + u64 timeoffset; /* * These fields let us detect when two contexts have both @@ -934,6 +923,8 @@ struct bpf_perf_event_data_kern { struct perf_cgroup_info { u64 time; u64 timestamp; + u64 timeoffset; + int active; }; struct perf_cgroup { diff --git a/kernel/events/core.c b/kernel/events/core.c index fc18664f49b0..479c9e672ec4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; + struct perf_cgroup_info *t; - now = perf_clock(); - - info = this_cpu_ptr(cgrp->info); - - info->time += now - info->timestamp; - info->timestamp = now; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; + info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { + struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* @@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { + info = this_cpu_ptr(event->cgrp->info); + __update_cgrp_time(info, perf_clock(), true); + } } static inline void @@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -981,14 +1018,6 @@ out: return ret; } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { @@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2482,40 +2552,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2556,8 +2592,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -3251,16 +3285,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3274,7 +3298,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3711,13 +3750,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. + */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3801,13 +3846,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(task, ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3818,13 +3873,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -4418,6 +4466,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4477,10 +4537,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now;; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5802,18 +5861,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -6353,7 +6400,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { From 7fa981cad216e9f64f49e22112f610c0bfed91bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Jan 2022 10:20:38 -0800 Subject: [PATCH 0146/1295] perf/x86/intel: Add a quirk for the calculation of the number of counters on Alder Lake For some Alder Lake machine with all E-cores disabled in a BIOS, the below warning may be triggered. [ 2.010766] hw perf events fixed 5 > max(4), clipping! Current perf code relies on the CPUID leaf 0xA and leaf 7.EDX[15] to calculate the number of the counters and follow the below assumption. For a hybrid configuration, the leaf 7.EDX[15] (X86_FEATURE_HYBRID_CPU) is set. The leaf 0xA only enumerate the common counters. Linux perf has to manually add the extra GP counters and fixed counters for P-cores. For a non-hybrid configuration, the X86_FEATURE_HYBRID_CPU should not be set. The leaf 0xA enumerates all counters. However, that's not the case when all E-cores are disabled in a BIOS. Although there are only P-cores in the system, the leaf 7.EDX[15] (X86_FEATURE_HYBRID_CPU) is still set. But the leaf 0xA is updated to enumerate all counters of P-cores. The inconsistency triggers the warning. Several software ways were considered to handle the inconsistency. - Drop the leaf 0xA and leaf 7.EDX[15] CPUID enumeration support. Hardcode the number of counters. This solution may be a problem for virtualization. A hypervisor cannot control the number of counters in a Linux guest via changing the guest CPUID enumeration anymore. - Find another CPUID bit that is also updated with E-cores disabled. There may be a problem in the virtualization environment too. Because a hypervisor may disable the feature/CPUID bit. - The P-cores have a maximum of 8 GP counters and 4 fixed counters on ADL. The maximum number can be used to detect the case. This solution is implemented in this patch. Fixes: ee72a94ea4a6 ("perf/x86/intel: Fix fixed counter check warning for some Alder Lake") Reported-by: Damjan Marion (damarion) Reported-by: Chan Edison Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Damjan Marion (damarion) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1641925238-149288-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fd9f908debe5..d5f940c57934 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6236,6 +6236,19 @@ __init int intel_pmu_init(void) pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; } + + /* + * Quirk: For some Alder Lake machine, when all E-cores are disabled in + * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However, + * the X86_FEATURE_HYBRID_CPU is still set. The above codes will + * mistakenly add extra counters for P-cores. Correct the number of + * counters here. + */ + if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, From 96fd2e89fba1aaada6f4b1e5d25a9d9ecbe1943d Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Thu, 23 Dec 2021 22:48:26 +0800 Subject: [PATCH 0147/1295] perf/x86/intel/uncore: Fix CAS_COUNT_WRITE issue for ICX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The user recently report a perf issue in the ICX platform, when test by perf event “uncore_imc_x/cas_count_write”,the write bandwidth is always very small (only 0.38MB/s), it is caused by the wrong "umask" for the "cas_count_write" event. When double-checking, find "cas_count_read" also is wrong. The public document for ICX uncore: 3rd Gen Intel® Xeon® Processor Scalable Family, Codename Ice Lake,Uncore Performance Monitoring Reference Manual, Revision 1.00, May 2021 On 2.4.7, it defines Unit Masks for CAS_COUNT: RD b00001111 WR b00110000 So corrected both "cas_count_read" and "cas_count_write" for ICX. Old settings: hswep_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c") New settings: snr_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30") Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Signed-off-by: Zhengjun Xing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211223144826.841267-1-zhengjun.xing@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 3660f698fb2a..ed869443efb2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5482,7 +5482,7 @@ static struct intel_uncore_type icx_uncore_imc = { .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, + .event_descs = snr_uncore_imc_events, .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, .event_ctl = SNR_IMC_MMIO_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, From 0036fb00a756a2f6e360d44e2e3d2200a8afbc9b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 5 Jan 2022 10:56:59 -0800 Subject: [PATCH 0148/1295] perf/x86/rapl: fix AMD event handling The RAPL events exposed under /sys/devices/power/events should only reflect what the underlying hardware actually support. This is how it works on Intel RAPL and Intel core/uncore PMUs in general. But on AMD, this was not the case. All possible RAPL events were advertised. This is what it showed on an AMD Fam17h: $ ls /sys/devices/power/events/ energy-cores energy-gpu energy-pkg energy-psys energy-ram energy-cores.scale energy-gpu.scale energy-pkg.scale energy-psys.scale energy-ram.scale energy-cores.unit energy-gpu.unit energy-pkg.unit energy-psys.unit energy-ram.unit Yet, on AMD Fam17h, only energy-pkg is supported. This patch fixes the problem. Given the way perf_msr_probe() works, the amd_rapl_msrs[] table has to have all entries filled out and in particular the group field, otherwise perf_msr_probe() defaults to making the event visible. With the patch applied, the kernel now only shows was is actually supported: $ ls /sys/devices/power/events/ energy-pkg energy-pkg.scale energy-pkg.unit The patch also uses the RAPL_MSR_MASK because only the 32-bits LSB of the RAPL counters are relevant when reading power consumption. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220105185659.643355-1-eranian@google.com --- arch/x86/events/rapl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 85feafacc445..77e3a47af5ad 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -536,11 +536,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = { * - perf_msr_probe(PERF_RAPL_MAX) * - want to use same event codes across both architectures */ -static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { - [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 }, }; - static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); From 1ac7fd8159a842b3aa51f0b46a351fa3eeb8fbf3 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 4 Jan 2022 08:51:16 -0800 Subject: [PATCH 0149/1295] perf/x86/intel/lbr: Support LBR format V7 The Goldmont plus and Tremont have LBR format V7. The V7 has LBR_INFO, which is the same as LBR format V5. But V7 doesn't support TSX. Without the patch, the associated misprediction and cycles information in the LBR_INFO may be lost on a Goldmont plus platform. For Tremont, the patch only impacts the non-PEBS events. Because of the adaptive PEBS, the LBR_INFO is always processed for a PEBS event. Currently, two different ways are used to check the LBR capabilities, which make the codes complex and confusing. For the LBR format V4 and earlier, the global static lbr_desc array is used to store the flags for the LBR capabilities in each LBR format. For LBR format V5 and V6, the current code checks the version number for the LBR capabilities. There are common LBR capabilities among LBR format versions. Several flags for the LBR capabilities are introduced into the struct x86_pmu. The flags, which can be shared among LBR formats, are used to check the LBR capabilities. Add intel_pmu_lbr_init() to set the flags accordingly at boot time. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/1641315077-96661-1-git-send-email-peterz@infradead.org --- arch/x86/events/intel/core.c | 2 + arch/x86/events/intel/lbr.c | 108 ++++++++++++++++++++--------------- arch/x86/events/perf_event.h | 10 +++- 3 files changed, 72 insertions(+), 48 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d5f940c57934..c91434056c29 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6353,6 +6353,8 @@ __init int intel_pmu_init(void) } if (x86_pmu.lbr_nr) { + intel_pmu_lbr_init(); + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); /* only support branch_stack snapshot for perfmon >= v2 */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8043213b75a5..b7228a170278 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,14 +8,6 @@ #include "../perf_event.h" -static const enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - /* * Intel LBR_SELECT bits * Intel Vol3a, April 2011, Section 16.7 Table 16-10 @@ -243,7 +235,7 @@ void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + if (x86_pmu.lbr_has_info) wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -305,11 +297,10 @@ enum { */ static inline bool lbr_from_signext_quirk_needed(void) { - int lbr_format = x86_pmu.intel_cap.lbr_format; bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM); - return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); + return !tsx_support && x86_pmu.lbr_has_tsx; } static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); @@ -427,12 +418,12 @@ rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) void intel_pmu_lbr_restore(void *ctx) { - bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; - int i; - unsigned lbr_idx, mask; + bool need_info = x86_pmu.lbr_has_info; u64 tos = task_ctx->tos; + unsigned lbr_idx, mask; + int i; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { @@ -444,7 +435,7 @@ void intel_pmu_lbr_restore(void *ctx) lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + if (need_info) wrlbr_info(lbr_idx, 0); } @@ -519,9 +510,9 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { - bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; + bool need_info = x86_pmu.lbr_has_info; unsigned lbr_idx, mask; u64 tos; int i; @@ -816,7 +807,6 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -831,9 +821,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; from = rdlbr_from(lbr_idx, NULL); to = rdlbr_to(lbr_idx, NULL); @@ -845,38 +833,40 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (call_stack && !from) break; - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; + if (x86_pmu.lbr_has_info) { + if (need_info) { + u64 info; - info = rdlbr_info(lbr_idx, NULL); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } + info = rdlbr_info(lbr_idx, NULL); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + cycles = (info & LBR_INFO_CYCLES); + if (x86_pmu.lbr_has_tsx) { + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + } + } + } else { + int skip = 0; - if (lbr_format == LBR_FORMAT_TIME) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - cycles = ((to >> 48) & LBR_INFO_CYCLES); + if (x86_pmu.lbr_from_flags) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (x86_pmu.lbr_has_tsx) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); - to = (u64)((((s64)to) << 16) >> 16); + if (x86_pmu.lbr_to_cycles) { + cycles = ((to >> 48) & LBR_INFO_CYCLES); + to = (u64)((((s64)to) << 16) >> 16); + } } - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; - } - from = (u64)((((s64)from) << skip) >> skip); - /* * Some CPUs report duplicated abort records, * with the second entry not having an abort bit set. @@ -1120,7 +1110,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + x86_pmu.lbr_has_info) reg->config |= LBR_NO_INFO; return 0; @@ -1706,6 +1696,30 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void intel_pmu_lbr_init(void) +{ + switch (x86_pmu.intel_cap.lbr_format) { + case LBR_FORMAT_EIP_FLAGS2: + x86_pmu.lbr_has_tsx = 1; + fallthrough; + case LBR_FORMAT_EIP_FLAGS: + x86_pmu.lbr_from_flags = 1; + break; + + case LBR_FORMAT_INFO: + x86_pmu.lbr_has_tsx = 1; + fallthrough; + case LBR_FORMAT_INFO2: + x86_pmu.lbr_has_info = 1; + break; + + case LBR_FORMAT_TIME: + x86_pmu.lbr_from_flags = 1; + x86_pmu.lbr_to_cycles = 1; + break; + } +} + /* * LBR state size is variable based on the max number of registers. * This calculates the expected state size, which should match diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9d376e528dfc..150261d929b9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -215,7 +215,8 @@ enum { LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, + LBR_FORMAT_INFO2 = 0x07, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2, }; enum { @@ -840,6 +841,11 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + unsigned int lbr_has_info:1; + unsigned int lbr_has_tsx:1; + unsigned int lbr_from_flags:1; + unsigned int lbr_to_cycles:1; + /* * Intel Architectural LBR CPUID Enumeration */ @@ -1392,6 +1398,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_lbr_init(void); + void intel_pmu_arch_lbr_init(void); void intel_pmu_pebs_data_source_nhm(void); From 6b19788ddc5937831ffd27525a1b793953fd2d2b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 4 Jan 2022 08:51:17 -0800 Subject: [PATCH 0150/1295] perf/x86/intel/lbr: Add static_branch for LBR INFO flags Using static_branch to replace the LBR INFO flags to optimize the LBR INFO parsing. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/1641315077-96661-2-git-send-email-peterz@infradead.org --- arch/x86/events/intel/lbr.c | 53 ++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b7228a170278..f8fd25528935 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -893,37 +893,40 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_type); + static __always_inline int get_lbr_br_type(u64 info) { - if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) - return 0; + int type = 0; - return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; + if (static_branch_likely(&x86_lbr_type)) + type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; + + return type; } static __always_inline bool get_lbr_mispred(u64 info) { - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) - return 0; + bool mispred = 0; - return !!(info & LBR_INFO_MISPRED); -} + if (static_branch_likely(&x86_lbr_mispred)) + mispred = !!(info & LBR_INFO_MISPRED); -static __always_inline bool get_lbr_predicted(u64 info) -{ - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) - return 0; - - return !(info & LBR_INFO_MISPRED); + return mispred; } static __always_inline u16 get_lbr_cycles(u64 info) { - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && - !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) - return 0; + u16 cycles = info & LBR_INFO_CYCLES; - return info & LBR_INFO_CYCLES; + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + (!static_branch_likely(&x86_lbr_cycles) || + !(info & LBR_INFO_CYC_CNT_VALID))) + cycles = 0; + + return cycles; } static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, @@ -951,7 +954,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); - e->predicted = get_lbr_predicted(info); + e->predicted = !e->mispred; e->in_tx = !!(info & LBR_INFO_IN_TX); e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); @@ -1718,6 +1721,14 @@ void intel_pmu_lbr_init(void) x86_pmu.lbr_to_cycles = 1; break; } + + if (x86_pmu.lbr_has_info) { + /* + * Only used in combination with baseline pebs. + */ + static_branch_enable(&x86_lbr_mispred); + static_branch_enable(&x86_lbr_cycles); + } } /* @@ -1779,6 +1790,12 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; + if (x86_pmu.lbr_mispred) + static_branch_enable(&x86_lbr_mispred); + if (x86_pmu.lbr_timed_lbr) + static_branch_enable(&x86_lbr_cycles); + if (x86_pmu.lbr_br_type) + static_branch_enable(&x86_lbr_type); arch_lbr_xsave = is_arch_lbr_xsave_available(); if (arch_lbr_xsave) { From 5a4487f9ef5ef2fdb3215cadf0a9c3e5e8678634 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 13 Jan 2022 14:05:54 -0800 Subject: [PATCH 0151/1295] perf/x86/intel/uncore: Add IMC uncore support for ADL Current ADL uncore code only supports the legacy IMC (memory controller) free-running counters. Besides the free-running counters, ADL also supports several general purpose-counters. The general-purpose counters can also be accessed via MMIO but in a different location. Factor out __uncore_imc_init_box() with offset as a parameter. The function can be shared between ADL and TGL. The event format and the layout of the control registers are a little bit different from other uncore counters. The intel_generic_uncore_mmio_enable_event() can be shared with client IMC uncore. Expose the function. Add more PCI IDs for ADL machines. Fixes: 772ed05f3c5c ("perf/x86/intel/uncore: Add Alder Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1642111554-118524-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore.h | 3 +- arch/x86/events/intel/uncore_discovery.c | 4 +- arch/x86/events/intel/uncore_discovery.h | 2 + arch/x86/events/intel/uncore_snb.c | 214 ++++++++++++++++++++++- 5 files changed, 220 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index f1ba6ab2e97e..e497da9bf427 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1762,7 +1762,7 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { static const struct intel_uncore_init_fun adl_uncore_init __initconst = { .cpu_init = adl_uncore_cpu_init, - .mmio_init = tgl_uncore_mmio_init, + .mmio_init = adl_uncore_mmio_init, }; static const struct intel_uncore_init_fun icx_uncore_init __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b9687980aab6..2adeaf4de4df 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -584,10 +584,11 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); -void adl_uncore_cpu_init(void); void tgl_uncore_cpu_init(void); +void adl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); +void adl_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 3049c646fa20..6ddadb482f68 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -494,8 +494,8 @@ void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) writel(0, box->io_addr); } -static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 6d735611c281..cfaf558bdb6b 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -139,6 +139,8 @@ void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box); void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, struct perf_event *event); +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event); void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box); void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 0f63706cdadf..f698a55bde81 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 @@ -64,6 +65,20 @@ #define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 #define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660 #define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641 +#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601 +#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602 +#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609 +#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a +#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621 +#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623 +#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629 +#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637 +#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b +#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648 +#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649 +#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 +#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 +#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -155,6 +170,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); @@ -1334,6 +1350,62 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ } }; @@ -1390,7 +1462,8 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 #define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 -static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +static void __uncore_imc_init_box(struct intel_uncore_box *box, + unsigned int base_offset) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; @@ -1417,11 +1490,17 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif + addr += base_offset; box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } +static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, 0); +} + static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { .init_box = tgl_uncore_imc_freerunning_init_box, .exit_box = uncore_mmio_exit_box, @@ -1469,3 +1548,136 @@ void tgl_uncore_mmio_init(void) } /* end of Tiger Lake MMIO uncore support */ + +/* Alder Lake MMIO uncore support */ +#define ADL_UNCORE_IMC_BASE 0xd900 +#define ADL_UNCORE_IMC_MAP_SIZE 0x200 +#define ADL_UNCORE_IMC_CTR 0xe8 +#define ADL_UNCORE_IMC_CTRL 0xd0 +#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0 +#define ADL_UNCORE_IMC_BOX_CTL 0xc4 +#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800 +#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100 + +#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0) +#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1) +#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2) +#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \ + ADL_UNCORE_IMC_CTL_RST_CTRS) + +static void adl_uncore_imc_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE); + + /* The global control in MC1 can control both MCs. */ + if (box->io_addr && (box->pmu->pmu_idx == 1)) + writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL); +} + +static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static struct intel_uncore_ops adl_uncore_mmio_ops = { + .init_box = adl_uncore_imc_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = adl_uncore_mmio_disable_box, + .enable_box = adl_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00 +#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + ADL_UNC_CTL_CHMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET) + +static struct attribute *adl_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_chmask.attr, + &format_attr_edge.attr, + NULL, +}; + +static const struct attribute_group adl_uncore_imc_format_group = { + .name = "format", + .attrs = adl_uncore_imc_formats_attr, +}; + +static struct intel_uncore_type adl_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 64, + .perf_ctr = ADL_UNCORE_IMC_CTR, + .event_ctl = ADL_UNCORE_IMC_CTRL, + .event_mask = ADL_UNC_IMC_EVENT_MASK, + .box_ctl = ADL_UNCORE_IMC_BOX_CTL, + .mmio_offset = 0, + .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE, + .ops = &adl_uncore_mmio_ops, + .format_group = &adl_uncore_imc_format_group, +}; + +enum perf_adl_uncore_imc_freerunning_types { + ADL_MMIO_UNCORE_IMC_DATA_TOTAL, + ADL_MMIO_UNCORE_IMC_DATA_READ, + ADL_MMIO_UNCORE_IMC_DATA_WRITE, + ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX +}; + +static struct freerunning_counters adl_uncore_imc_freerunning[] = { + [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 }, +}; + +static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE); +} + +static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = { + .init_box = adl_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type adl_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 2, + .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE, + .freerunning = adl_uncore_imc_freerunning, + .ops = &adl_uncore_imc_freerunning_ops, + .event_descs = tgl_uncore_imc_events, + .format_group = &tgl_uncore_imc_format_group, +}; + +static struct intel_uncore_type *adl_mmio_uncores[] = { + &adl_uncore_imc, + &adl_uncore_imc_free_running, + NULL +}; + +void adl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = adl_mmio_uncores; +} + +/* end of Alder Lake MMIO uncore support */ From 8c16dc047b5dd8f7b3bf4584fa75733ea0dde7dc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Dec 2021 12:40:29 -0800 Subject: [PATCH 0152/1295] x86/perf: Avoid warning for Arch LBR without XSAVE Some hypervisors support Arch LBR, but without the LBR XSAVE support. The current Arch LBR init code prints a warning when the xsave size (0) is unexpected. Avoid printing the warning for the "no LBR XSAVE" case. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211215204029.150686-1-ak@linux.intel.com --- arch/x86/events/intel/lbr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f8fd25528935..669c2be14784 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1751,6 +1751,9 @@ static bool is_arch_lbr_xsave_available(void) * Check the LBR state with the corresponding software structure. * Disable LBR XSAVES support if the size doesn't match. */ + if (xfeature_size(XFEATURE_LBR) == 0) + return false; + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) return false; From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 11 Jan 2022 15:23:09 -0800 Subject: [PATCH 0153/1295] psi: Fix uaf issue when psi trigger is destroyed while being polled With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Analyzed-by: Eric Biggers Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Biggers Acked-by: Johannes Weiner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com --- Documentation/accounting/psi.rst | 3 +- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 -- kernel/cgroup/cgroup.c | 11 ++++-- kernel/sched/psi.c | 66 ++++++++++++++------------------ 5 files changed, 40 insertions(+), 45 deletions(-) diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index f2b3439edcc2..860fe651d645 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger for the same psi metric can be specified. However for each trigger a separate file descriptor is required to be able to poll it separately from others, therefore for each trigger a separate open() syscall should be made even -when opening the same psi interface file. +when opening the same psi interface file. Write operations to a file descriptor +with an already existing psi trigger will fail with EBUSY. Monitors activate only when system enters stall state for the monitored psi metric and deactivates upon exit from the stall state. While system is diff --git a/include/linux/psi.h b/include/linux/psi.h index a70ca833c6d7..f8ce53bfdb2a 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 516c0fe836fd..1a3cef26d129 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -141,9 +141,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; }; struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..9d05c3ca2d5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..c137c4d6983e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } From 98b0d890220d45418cfbc5157b3382e6da5a12ab Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:56 +0100 Subject: [PATCH 0154/1295] sched/pelt: Relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization of root cfs_rq after propagating the change down into the hierarchy, we revert the change of util_sum and propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Fixes: 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") Reported-by: Rick Yiu Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 16 +++++++++++++--- kernel/sched/pelt.h | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..d8f068d72b9d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3381,7 +3381,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3449,7 +3448,6 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { @@ -3681,7 +3679,19 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) From 7ceb77103001544a43e11d7f3a8a69a2c1f422cf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:57 +0100 Subject: [PATCH 0155/1295] sched/pelt: Continue to relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. update_tg_cfs_util() is not the only place where we round util_sum and lost some accumulated contributions that are not already reflected in util_avg. Modify update_tg_cfs_util() and detach_entity_load_avg() to not sync util_sum with the new util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization, we propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8f068d72b9d..ad2809cc57c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3451,11 +3451,11 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3464,13 +3464,20 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void @@ -3790,7 +3797,11 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; From 95246d1ec80b8d19d882cd8eb7ad094e63b41bb8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:58 +0100 Subject: [PATCH 0156/1295] sched/pelt: Relax the sync of runnable_sum with runnable_avg Similarly to util_avg and util_sum, don't sync runnable_sum with the low bound of runnable_avg but only ensure that runnable_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad2809cc57c7..0e87e1916504 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3483,11 +3483,11 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3498,11 +3498,16 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3702,7 +3707,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3789,12 +3797,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); @@ -3803,7 +3805,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); From 2d02fa8cc21a93da35cfba462bf8ab87bf2db651 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:59 +0100 Subject: [PATCH 0157/1295] sched/pelt: Relax the sync of load_sum with load_avg Similarly to util_avg and util_sum, don't sync load_sum with the low bound of load_avg but only ensure that load_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e87e1916504..f4f02c2cff87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3513,9 +3515,10 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3542,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3559,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - se->avg.load_sum = runnable_sum; - - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3687,7 +3693,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); From b171501f258063f5c56dd2c5fdf310802d8d7dc1 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 11 Jan 2022 17:55:59 +0800 Subject: [PATCH 0158/1295] sched/core: Accounting forceidle time for all tasks except idle task There are two types of forced idle time: forced idle time from cookie'd task and forced idle time form uncookie'd task. The forced idle time from uncookie'd task is actually caused by the cookie'd task in runqueue indirectly, and it's more accurate to measure the capacity loss with the sum of both. Assuming cpu x and cpu y are a pair of SMT siblings, consider the following scenarios: 1.There's a cookie'd task running on cpu x, and there're 4 uncookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from uncookie'd task); for cpu y, there will be 20% forced idle time (from cookie'd task). 2.There's a uncookie'd task running on cpu x, and there're 4 cookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from cookie'd task); for cpu y, there will be 20% forced idle time (from uncookie'd task). The scenario1 can recurrent by stress-ng(scenario2 can recurrent similary): (cookie'd)taskset -c x stress-ng -c 1 -l 100 (uncookie'd)taskset -c y stress-ng -c 4 -l 100 In the above two scenarios, the total capacity loss is 1 cpu, but in scenario1, the cookie'd forced idle time tells us 20% cpu capacity loss, in scenario2, the cookie'd forced idle time tells us 80% cpu capacity loss, which are not accurate. It'll be more accurate to measure with cookie'd forced idle time and uncookie'd forced idle time. Signed-off-by: Cruz Zhao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Don Link: https://lore.kernel.org/r/1641894961-9241-2-git-send-email-CruzZhao@linux.alibaba.com --- kernel/sched/core.c | 3 +-- kernel/sched/core_sched.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83872f95a1ea..0d2ab2a2f9fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } if (schedstat_enabled() && rq->core->core_forceidle_count) { - if (cookie) - rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_start = rq_clock(rq->core); rq->core->core_forceidle_occupation = occ; } diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1fb45672ec85..c8746a9a7ada 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq) rq_i = cpu_rq(i); p = rq_i->core_pick ?: rq_i->curr; - if (!p->core_cookie) + if (p == rq_i->idle) continue; __schedstat_add(p->stats.core_forceidle_sum, delta); From a315da5e686b02b20c1713dda818e8fb691526bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Dec 2021 21:59:00 -0800 Subject: [PATCH 0159/1295] sched/fair: Fix all kernel-doc warnings Quieten all kernel-doc warnings in kernel/sched/fair.c: kernel/sched/fair.c:3663: warning: No description found for return value of 'update_cfs_rq_load_avg' kernel/sched/fair.c:8601: warning: No description found for return value of 'asym_smt_can_pull_tasks' kernel/sched/fair.c:8673: warning: Function parameter or member 'sds' not described in 'update_sg_lb_stats' kernel/sched/fair.c:9483: warning: contents before sections Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ricardo Neri Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/20211218055900.2704-1-rdunlap@infradead.org --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4f02c2cff87..5146163bfabb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3668,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -8573,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8648,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -9455,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) From 7e406d1ff39b8ee574036418a5043c86723170cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Dec 2021 01:04:57 +0100 Subject: [PATCH 0160/1295] sched: Avoid double preemption in __cond_resched_*lock*() For PREEMPT/DYNAMIC_PREEMPT the *_unlock() will already trigger a preemption, no point in then calling preempt_schedule_common() *again*. Use _cond_resched() instead, since this is a NOP for the preemptible configs while it provide a preemption point for the others. Reported-by: xuhaifeng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YcGnvDEYBwOiV0cR@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d2ab2a2f9fe..56b428c8ea96 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8218,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8238,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8258,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); From 0e3872499de1a1230cef5221607d71aa09264bd5 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 7 Jan 2022 17:52:54 +0800 Subject: [PATCH 0161/1295] kernel/sched: Remove dl_boosted flag comment since commit 2279f540ea7d ("sched/deadline: Fix priority inheritance with multiple scheduling classes"), we should not keep it here. Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20220107095254.GA49258@localhost.localdomain --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a1f16df66e3..7f8b4495e243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,10 +614,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_boosted tells if we are boosted due to DI. If so we are - * outside bandwidth enforcement mechanism (but only until we - * exit the critical section); - * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001 From: Zechuan Chen Date: Tue, 28 Dec 2021 19:13:38 +0800 Subject: [PATCH 0162/1295] perf probe: Fix ppc64 'perf probe add events failed' case Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2"), in ppc64 ABIv1, our perf command eliminates the need to use the prefix "." at the symbol name. But when the command "perf probe -a schedule" is executed on ppc64 ABIv1, it obtains two symbol address information through /proc/kallsyms, for example: cat /proc/kallsyms | grep -w schedule c000000000657020 T .schedule c000000000d4fdb8 D schedule The symbol "D schedule" is not a function symbol, and perf will print: "p:probe/schedule _text+13958584"Failed to write event: Invalid argument Therefore, when searching symbols from map and adding probe point for them, a symbol type check is added. If the type of symbol is not a function, skip it. Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2") Signed-off-by: Zechuan Chen Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jianlin Lv Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index b2a02c9ab8ea..a834918a0a0d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, for (j = 0; j < num_matched_functions; j++) { sym = syms[j]; + if (sym->type != STT_FUNC) + continue; + /* There can be duplicated symbols in the map */ for (i = 0; i < j; i++) if (sym->start == syms[i]->start) { From 1855b796f2f672cbb25400be2d3171c26fc869a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:28 -0300 Subject: [PATCH 0163/1295] perf affinity: Allow passing a NULL arg to affinity__cleanup() Just like with free(), NULL is checked to avoid having all callers do it. Its convenient for when not using affinity setup/cleanup for dummy CPU maps, i.e. CPU maps for pid targets. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/affinity.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c index f1e30d566db3..4d216c0dc425 100644 --- a/tools/perf/util/affinity.c +++ b/tools/perf/util/affinity.c @@ -62,7 +62,7 @@ void affinity__set(struct affinity *a, int cpu) clear_bit(cpu, a->sched_cpus); } -void affinity__cleanup(struct affinity *a) +static void __affinity__cleanup(struct affinity *a) { int cpu_set_size = get_cpu_set_size(); @@ -71,3 +71,9 @@ void affinity__cleanup(struct affinity *a) zfree(&a->sched_cpus); zfree(&a->orig_cpus); } + +void affinity__cleanup(struct affinity *a) +{ + if (a != NULL) + __affinity__cleanup(a); +} From 49de179577e7b05b57f625bf05cdc60a72de38d0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:29 -0300 Subject: [PATCH 0164/1295] perf stat: No need to setup affinities when starting a workload I.e. the simple: $ perf stat sleep 1 Uses a dummy CPU map and thus there is no need to setup/cleanup affinities to avoid IPIs, etc. With this we're down to a sched_getaffinity() call, in the libnuma initialization, that probably can be removed in a followup patch. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 973ade18b72a..934e992c966f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -788,7 +788,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) const bool forks = (argc > 0); bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; int err; bool second_pass = false; @@ -803,8 +803,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (group) evlist__set_leader(evsel_list); - if (affinity__setup(&affinity) < 0) - return -1; + if (!cpu_map__is_dummy(evsel_list->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return -1; + affinity = &saved_affinity; + } evlist__for_each_entry(evsel_list, counter) { if (bpf_counter__load(counter, &target)) @@ -813,7 +816,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) all_counters_use_bpf = false; } - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; /* @@ -869,7 +872,7 @@ try_again: */ /* First close errored or weak retry */ - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; if (!counter->reset_group && !counter->errored) @@ -878,7 +881,7 @@ try_again: perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx); } /* Now reopen weak */ - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; if (!counter->reset_group && !counter->errored) @@ -904,7 +907,7 @@ try_again_reset: counter->supported = true; } } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evsel_list, counter) { if (!counter->supported) { From f350ee95498a3fa65c37ed597d9c051c6b2b6974 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:30 -0300 Subject: [PATCH 0165/1295] perf evlist: No need to setup affinities when enabling events for pid targets When the target is a pid, not started by 'perf stat' we need to enable the events, and in that case there is no need to setup affinities as we use a dummy CPU map, with just one entry set to -1. So stop doing it to avoid this needless call to sched_getaffinity(): # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1 sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8 > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea] > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x5329ca] > /var/home/acme/bin/perf(__evlist__enable.constprop.0+0x23) [0x4b9693] > /var/home/acme/bin/perf(enable_counters+0x14d) [0x42de5d] > /var/home/acme/bin/perf(cmd_stat+0x2358) [0x4310c8] > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa] > /var/home/acme/bin/perf(main+0x612) [0x40f8c2] > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74] > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd] Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6e88d404b5b3..ae6d4363da76 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -487,12 +487,16 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; - if (affinity__setup(&affinity) < 0) - return; + // See explanation in evlist__close() + if (!cpu_map__is_dummy(evlist->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return; + affinity = &saved_affinity; + } - evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) { pos = evlist_cpu_itr.evsel; if (evsel__strcmp(pos, evsel_name)) continue; @@ -500,7 +504,7 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) continue; evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx); } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evlist, pos) { if (evsel__strcmp(pos, evsel_name)) continue; From 0d3d237651fd7a01fe5dc501b0d170a43d8156ba Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:31 -0300 Subject: [PATCH 0166/1295] perf evlist: No need to setup affinities when disabling events for pid targets When the target is a pid, not started by 'perf stat' we need to disable the events, and in that case there is no need to setup affinities as we use a dummy CPU map, with just one entry set to -1. So stop doing it to avoid this needless call to sched_getaffinity(): # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1 sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8 > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea] > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x532a2a] > /var/home/acme/bin/perf(__evlist__disable.constprop.0+0x27) [0x4b9827] > /var/home/acme/bin/perf(cmd_stat+0x29b5) [0x431725] > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa] > /var/home/acme/bin/perf(main+0x612) [0x40f8c2] > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74] > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd] Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-5-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ae6d4363da76..eaad04e1672a 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -430,15 +430,19 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; bool has_imm = false; - if (affinity__setup(&affinity) < 0) - return; + // See explanation in evlist__close() + if (!cpu_map__is_dummy(evlist->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return; + affinity = &saved_affinity; + } /* Disable 'immediate' events last */ for (int imm = 0; imm <= 1; imm++) { - evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) { pos = evlist_cpu_itr.evsel; if (evsel__strcmp(pos, evsel_name)) continue; @@ -454,7 +458,7 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) break; } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evlist, pos) { if (evsel__strcmp(pos, evsel_name)) continue; From a8e422af696133003903e440b87f10a8248051b8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 18 Jan 2022 10:18:36 -0800 Subject: [PATCH 0167/1295] xfs: remove unused xfs_ioctl32.h declarations Remove these unused ia32 compat declarations; all the bits involved have either been withdrawn or hoisted to the VFS. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen --- fs/xfs/xfs_ioctl32.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index fc5a91f3a5e0..c14852362fce 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -142,24 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) #ifdef BROKEN_X86_ALIGNMENT -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct compat_xfs_flock64 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} compat_xfs_flock64_t; - -#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) -#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) - typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ From 6e7f90d163afa8fc2efd6ae318e7c20156a5621f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Jan 2022 17:00:16 -0500 Subject: [PATCH 0168/1295] lockd: fix server crash on reboot of client holding lock I thought I was iterating over the array when actually the iteration is over the values contained in the array? Ugh, keep it simple. Symptoms were a null deference in vfs_lock_file() when an NFSv3 client that previously held a lock came back up and sent a notify. Reported-by: Jonathan Woithe Fixes: 7f024fcd5c97 ("Keep read and write fds with each nlm_file") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index cb3a7512c33e..54c2e42130ca 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -179,19 +179,20 @@ nlm_delete_file(struct nlm_file *file) static int nlm_unlock_files(struct nlm_file *file) { struct file_lock lock; - struct file *f; lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - for (f = file->f_file[0]; f <= file->f_file[1]; f++) { - if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { - pr_warn("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); - return 1; - } - } + if (file->f_file[O_RDONLY] && + vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + goto out_err; + if (file->f_file[O_WRONLY] && + vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + goto out_err; return 0; +out_err: + pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__); + return 1; } /* From e4e2787bef7e643511cf0f352deb6bd16e7fa9b4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 18 Jan 2022 16:36:27 -0600 Subject: [PATCH 0169/1295] smb3: add new defines from protocol specification In the October updates to MS-SMB2 two additional FSCTLs were described. Add the missing defines for these, as well as fix a typo in an earlier define. Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/smbfs_common/smb2pdu.h | 2 +- fs/smbfs_common/smbfsctl.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7ccadcbe684b..38b8fc514860 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -449,7 +449,7 @@ struct smb2_netname_neg_context { */ /* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 +#define SMB2_ACCEPT_TRANSPORT_LEVEL_SECURITY 0x00000001 struct smb2_transport_capabilities_context { __le16 ContextType; /* 6 */ diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index 926f87cd6af0..d51939c43ad7 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -95,8 +95,10 @@ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */ +#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b +#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ From 51b667a32d616c399124328be97833ec154e0ff8 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Mon, 17 Jan 2022 20:19:57 +0200 Subject: [PATCH 0170/1295] MAINTAINERS: add common wireless and wireless-next trees For easier maintenance we have decided to create common wireless and wireless-next trees for all wireless patches. Old mac80211 and wireless-drivers trees will not be used anymore. While at it, add a wiki link to wireless drivers section and a patchwork link to 802.11, mac80211 and rfkill sections. Also use https in patchwork links. Acked-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220117181958.3509-1-kvalo@kernel.org --- MAINTAINERS | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 306de106f31b..f67e7dae2c55 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -190,8 +190,9 @@ M: Johannes Berg L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git +Q: https://patchwork.kernel.org/project/linux-wireless/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git F: Documentation/driver-api/80211/cfg80211.rst F: Documentation/networking/regulatory.rst F: include/linux/ieee80211.h @@ -11308,8 +11309,9 @@ M: Johannes Berg L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git +Q: https://patchwork.kernel.org/project/linux-wireless/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git F: Documentation/networking/mac80211-injection.rst F: Documentation/networking/mac80211_hwsim/mac80211_hwsim.rst F: drivers/net/wireless/mac80211_hwsim.[ch] @@ -13302,9 +13304,10 @@ NETWORKING DRIVERS (WIRELESS) M: Kalle Valo L: linux-wireless@vger.kernel.org S: Maintained -Q: http://patchwork.kernel.org/project/linux-wireless/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git +W: https://wireless.wiki.kernel.org/ +Q: https://patchwork.kernel.org/project/linux-wireless/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git F: Documentation/devicetree/bindings/net/wireless/ F: drivers/net/wireless/ @@ -16391,8 +16394,9 @@ M: Johannes Berg L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git +Q: https://patchwork.kernel.org/project/linux-wireless/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git F: Documentation/ABI/stable/sysfs-class-rfkill F: Documentation/driver-api/rfkill.rst F: include/linux/rfkill.h From a1222ca0681f1db3696d703aa8df61c8b41a61ac Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Mon, 17 Jan 2022 20:19:58 +0200 Subject: [PATCH 0171/1295] MAINTAINERS: remove extra wireless section There's an unneeded and almost empty wireless section in MAINTAINERS, seems to be leftovers from commit 0e324cf640fb ("MAINTAINERS: changes for wireless"). I don't see any need for that so let's remove it. Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220117181958.3509-2-kvalo@kernel.org --- MAINTAINERS | 4 ---- 1 file changed, 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f67e7dae2c55..ae00e2b5e8dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13441,10 +13441,6 @@ F: include/net/tls.h F: include/uapi/linux/tls.h F: net/tls/* -NETWORKING [WIRELESS] -L: linux-wireless@vger.kernel.org -Q: http://patchwork.kernel.org/project/linux-wireless/list/ - NETXEN (1/10) GbE SUPPORT M: Manish Chopra M: Rahul Verma From 0a3d12ab5097b1d045e693412e6b366b7e82031b Mon Sep 17 00:00:00 2001 From: Padmanabha Srinivasaiah Date: Tue, 18 Jan 2022 01:51:26 +0100 Subject: [PATCH 0172/1295] drm/vc4: Fix deadlock on DSI device attach error DSI device attach to DSI host will be done with host device's lock held. Un-registering host in "device attach" error path (ex: probe retry) will result in deadlock with below call trace and non operational DSI display. Startup Call trace: [ 35.043036] rt_mutex_slowlock.constprop.21+0x184/0x1b8 [ 35.043048] mutex_lock_nested+0x7c/0xc8 [ 35.043060] device_del+0x4c/0x3e8 [ 35.043075] device_unregister+0x20/0x40 [ 35.043082] mipi_dsi_remove_device_fn+0x18/0x28 [ 35.043093] device_for_each_child+0x68/0xb0 [ 35.043105] mipi_dsi_host_unregister+0x40/0x90 [ 35.043115] vc4_dsi_host_attach+0xf0/0x120 [vc4] [ 35.043199] mipi_dsi_attach+0x30/0x48 [ 35.043209] tc358762_probe+0x128/0x164 [tc358762] [ 35.043225] mipi_dsi_drv_probe+0x28/0x38 [ 35.043234] really_probe+0xc0/0x318 [ 35.043244] __driver_probe_device+0x80/0xe8 [ 35.043254] driver_probe_device+0xb8/0x118 [ 35.043263] __device_attach_driver+0x98/0xe8 [ 35.043273] bus_for_each_drv+0x84/0xd8 [ 35.043281] __device_attach+0xf0/0x150 [ 35.043290] device_initial_probe+0x1c/0x28 [ 35.043300] bus_probe_device+0xa4/0xb0 [ 35.043308] deferred_probe_work_func+0xa0/0xe0 [ 35.043318] process_one_work+0x254/0x700 [ 35.043330] worker_thread+0x4c/0x448 [ 35.043339] kthread+0x19c/0x1a8 [ 35.043348] ret_from_fork+0x10/0x20 Shutdown Call trace: [ 365.565417] Call trace: [ 365.565423] __switch_to+0x148/0x200 [ 365.565452] __schedule+0x340/0x9c8 [ 365.565467] schedule+0x48/0x110 [ 365.565479] schedule_timeout+0x3b0/0x448 [ 365.565496] wait_for_completion+0xac/0x138 [ 365.565509] __flush_work+0x218/0x4e0 [ 365.565523] flush_work+0x1c/0x28 [ 365.565536] wait_for_device_probe+0x68/0x158 [ 365.565550] device_shutdown+0x24/0x348 [ 365.565561] kernel_restart_prepare+0x40/0x50 [ 365.565578] kernel_restart+0x20/0x70 [ 365.565591] __do_sys_reboot+0x10c/0x220 [ 365.565605] __arm64_sys_reboot+0x2c/0x38 [ 365.565619] invoke_syscall+0x4c/0x110 [ 365.565634] el0_svc_common.constprop.3+0xfc/0x120 [ 365.565648] do_el0_svc+0x2c/0x90 [ 365.565661] el0_svc+0x4c/0xf0 [ 365.565671] el0t_64_sync_handler+0x90/0xb8 [ 365.565682] el0t_64_sync+0x180/0x184 Signed-off-by: Padmanabha Srinivasaiah Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220118005127.29015-1-treasure4paddy@gmail.com --- drivers/gpu/drm/vc4/vc4_dsi.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index a229da58962a..9300d3354c51 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -1262,7 +1262,6 @@ static int vc4_dsi_host_attach(struct mipi_dsi_host *host, struct mipi_dsi_device *device) { struct vc4_dsi *dsi = host_to_dsi(host); - int ret; dsi->lanes = device->lanes; dsi->channel = device->channel; @@ -1297,18 +1296,15 @@ static int vc4_dsi_host_attach(struct mipi_dsi_host *host, return 0; } - ret = component_add(&dsi->pdev->dev, &vc4_dsi_ops); - if (ret) { - mipi_dsi_host_unregister(&dsi->dsi_host); - return ret; - } - - return 0; + return component_add(&dsi->pdev->dev, &vc4_dsi_ops); } static int vc4_dsi_host_detach(struct mipi_dsi_host *host, struct mipi_dsi_device *device) { + struct vc4_dsi *dsi = host_to_dsi(host); + + component_del(&dsi->pdev->dev, &vc4_dsi_ops); return 0; } @@ -1686,9 +1682,7 @@ static int vc4_dsi_dev_remove(struct platform_device *pdev) struct device *dev = &pdev->dev; struct vc4_dsi *dsi = dev_get_drvdata(dev); - component_del(&pdev->dev, &vc4_dsi_ops); mipi_dsi_host_unregister(&dsi->dsi_host); - return 0; } From 8b59b0a53c840921b625378f137e88adfa87647e Mon Sep 17 00:00:00 2001 From: sparkhuang Date: Wed, 15 Dec 2021 10:08:23 +0100 Subject: [PATCH 0173/1295] ARM: 9170/1: fix panic when kasan and kprobe are enabled arm32 uses software to simulate the instruction replaced by kprobe. some instructions may be simulated by constructing assembly functions. therefore, before executing instruction simulation, it is necessary to construct assembly function execution environment in C language through binding registers. after kasan is enabled, the register binding relationship will be destroyed, resulting in instruction simulation errors and causing kernel panic. the kprobe emulate instruction function is distributed in three files: actions-common.c actions-arm.c actions-thumb.c, so disable KASAN when compiling these files. for example, use kprobe insert on cap_capable+20 after kasan enabled, the cap_capable assembly code is as follows: : e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr} e1a05000 mov r5, r0 e280006c add r0, r0, #108 ; 0x6c e1a04001 mov r4, r1 e1a06002 mov r6, r2 e59fa090 ldr sl, [pc, #144] ; ebfc7bf8 bl c03aa4b4 <__asan_load4> e595706c ldr r7, [r5, #108] ; 0x6c e2859014 add r9, r5, #20 ...... The emulate_ldr assembly code after enabling kasan is as follows: c06f1384 : e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr} e282803c add r8, r2, #60 ; 0x3c e1a05000 mov r5, r0 e7e37855 ubfx r7, r5, #16, #4 e1a00008 mov r0, r8 e1a09001 mov r9, r1 e1a04002 mov r4, r2 ebf35462 bl c03c6530 <__asan_load4> e357000f cmp r7, #15 e7e36655 ubfx r6, r5, #12, #4 e205a00f and sl, r5, #15 0a000001 beq c06f13bc e0840107 add r0, r4, r7, lsl #2 ebf3545c bl c03c6530 <__asan_load4> e084010a add r0, r4, sl, lsl #2 ebf3545a bl c03c6530 <__asan_load4> e2890010 add r0, r9, #16 ebf35458 bl c03c6530 <__asan_load4> e5990010 ldr r0, [r9, #16] e12fff30 blx r0 e356000f cm r6, #15 1a000014 bne c06f1430 e1a06000 mov r6, r0 e2840040 add r0, r4, #64 ; 0x40 ...... when running in emulate_ldr to simulate the ldr instruction, panic occurred, and the log is as follows: Unable to handle kernel NULL pointer dereference at virtual address 00000090 pgd = ecb46400 [00000090] *pgd=2e0fa003, *pmd=00000000 Internal error: Oops: 206 [#1] SMP ARM PC is at cap_capable+0x14/0xb0 LR is at emulate_ldr+0x50/0xc0 psr: 600d0293 sp : ecd63af8 ip : 00000004 fp : c0a7c30c r10: 00000000 r9 : c30897f4 r8 : ecd63cd4 r7 : 0000000f r6 : 0000000a r5 : e59fa090 r4 : ecd63c98 r3 : c06ae294 r2 : 00000000 r1 : b7611300 r0 : bf4ec008 Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment user Control: 32c5387d Table: 2d546400 DAC: 55555555 Process bash (pid: 1643, stack limit = 0xecd60190) (cap_capable) from (kprobe_handler+0x218/0x340) (kprobe_handler) from (kprobe_trap_handler+0x24/0x48) (kprobe_trap_handler) from (do_undefinstr+0x13c/0x364) (do_undefinstr) from (__und_svc_finish+0x0/0x30) (__und_svc_finish) from (cap_capable+0x18/0xb0) (cap_capable) from (cap_vm_enough_memory+0x38/0x48) (cap_vm_enough_memory) from (security_vm_enough_memory_mm+0x48/0x6c) (security_vm_enough_memory_mm) from (copy_process.constprop.5+0x16b4/0x25c8) (copy_process.constprop.5) from (_do_fork+0xe8/0x55c) (_do_fork) from (SyS_clone+0x1c/0x24) (SyS_clone) from (__sys_trace_return+0x0/0x10) Code: 0050a0e1 6c0080e2 0140a0e1 0260a0e1 (f801f0e7) Fixes: 35aa1df43283 ("ARM kprobes: instruction single-stepping support") Fixes: 421015713b30 ("ARM: 9017/2: Enable KASan for ARM") Signed-off-by: huangshaobo Acked-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/probes/kprobes/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/probes/kprobes/Makefile b/arch/arm/probes/kprobes/Makefile index 14db56f49f0a..6159010dac4a 100644 --- a/arch/arm/probes/kprobes/Makefile +++ b/arch/arm/probes/kprobes/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +KASAN_SANITIZE_actions-common.o := n +KASAN_SANITIZE_actions-arm.o := n +KASAN_SANITIZE_actions-thumb.o := n obj-$(CONFIG_KPROBES) += core.o actions-common.o checkers-common.o obj-$(CONFIG_ARM_KPROBES_TEST) += test-kprobes.o test-kprobes-objs := test-core.o From 15420269b02a63ed8c1841905d8b8b2403246004 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Jan 2022 13:45:09 +0100 Subject: [PATCH 0174/1295] ARM: 9179/1: uaccess: avoid alignment faults in copy_[from|to]_kernel_nofault The helpers that are used to implement copy_from_kernel_nofault() and copy_to_kernel_nofault() cast a void* to a pointer to a wider type, which may result in alignment faults on ARM if the compiler decides to use double-word or multiple-word load/store instructions. Only configurations that define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y are affected, given that commit 2423de2e6f4d ("ARM: 9115/1: mm/maccess: fix unaligned copy_{from,to}_kernel_nofault") ensures that dst and src are sufficiently aligned otherwise. So use the unaligned accessors for accessing dst and src in cases where they may be misaligned. Cc: # depends on 2423de2e6f4d Fixes: 2df4c9a741a0 ("ARM: 9112/1: uaccess: add __{get,put}_kernel_nofault") Reviewed-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/uaccess.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 36fbc3329252..32dbfd81f42a 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -497,7 +498,10 @@ do { \ } \ default: __err = __get_user_bad(); break; \ } \ - *(type *)(dst) = __val; \ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) \ + put_unaligned(__val, (type *)(dst)); \ + else \ + *(type *)(dst) = __val; /* aligned by caller */ \ if (__err) \ goto err_label; \ } while (0) @@ -507,7 +511,9 @@ do { \ const type *__pk_ptr = (dst); \ unsigned long __dst = (unsigned long)__pk_ptr; \ int __err = 0; \ - type __val = *(type *)src; \ + type __val = IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + ? get_unaligned((type *)(src)) \ + : *(type *)(src); /* aligned by caller */ \ switch (sizeof(type)) { \ case 1: __put_user_asm_byte(__val, __dst, __err, ""); break; \ case 2: __put_user_asm_half(__val, __dst, __err, ""); break; \ From 9f80ccda53b9417236945bc7ece4b519037df74d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Jan 2022 19:32:17 +0100 Subject: [PATCH 0175/1295] ARM: 9180/1: Thumb2: align ALT_UP() sections in modules sufficiently When building for Thumb2, the .alt.smp.init sections that are emitted by the ALT_UP() patching code may not be 32-bit aligned, even though the fixup_smp_on_up() routine expects that. This results in alignment faults at module load time, which need to be fixed up by the fault handler. So let's align those sections explicitly, and prevent this from occurring. Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/assembler.h | 2 ++ arch/arm/include/asm/processor.h | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 7d23d4bb2168..6fe67963ba5a 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -288,6 +288,7 @@ */ #define ALT_UP(instr...) \ .pushsection ".alt.smp.init", "a" ;\ + .align 2 ;\ .long 9998b - . ;\ 9997: instr ;\ .if . - 9997b == 2 ;\ @@ -299,6 +300,7 @@ .popsection #define ALT_UP_B(label) \ .pushsection ".alt.smp.init", "a" ;\ + .align 2 ;\ .long 9998b - . ;\ W(b) . + (label - 9998b) ;\ .popsection diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 6af68edfa53a..bdc35c0e8dfb 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -96,6 +96,7 @@ unsigned long __get_wchan(struct task_struct *p); #define __ALT_SMP_ASM(smp, up) \ "9998: " smp "\n" \ " .pushsection \".alt.smp.init\", \"a\"\n" \ + " .align 2\n" \ " .long 9998b - .\n" \ " " up "\n" \ " .popsection\n" From d3cbc6e323c9299d10c8d2e4127c77c7d05d07b1 Mon Sep 17 00:00:00 2001 From: Raymond Jay Golo Date: Thu, 13 Jan 2022 08:06:20 +0800 Subject: [PATCH 0176/1295] drm: panel-orientation-quirks: Add quirk for the 1Netbook OneXPlayer The 1Netbook OneXPlayer uses a panel which has been mounted 90 degrees rotated. Add a quirk for this. Signed-off-by: Raymond Jay Golo Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220113000619.90988-1-rjgolo@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 042bb80383c9..b910978d3e48 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -115,6 +115,12 @@ static const struct drm_dmi_panel_orientation_data lcd1280x1920_rightside_up = { .orientation = DRM_MODE_PANEL_ORIENTATION_RIGHT_UP, }; +static const struct drm_dmi_panel_orientation_data lcd1600x2560_leftside_up = { + .width = 1600, + .height = 2560, + .orientation = DRM_MODE_PANEL_ORIENTATION_LEFT_UP, +}; + static const struct dmi_system_id orientation_data[] = { { /* Acer One 10 (S1003) */ .matches = { @@ -275,6 +281,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Default string"), }, .driver_data = (void *)&onegx1_pro, + }, { /* OneXPlayer */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ONE-NETBOOK TECHNOLOGY CO., LTD."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "ONE XPLAYER"), + }, + .driver_data = (void *)&lcd1600x2560_leftside_up, }, { /* Samsung GalaxyBook 10.6 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), From c476d430bfc02115e67a32a268ebd31b8d683698 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 Dec 2021 08:52:09 -0400 Subject: [PATCH 0177/1295] dt-bindings: display: Add SPI peripheral schema to SPI based displays With 'unevaluatedProperties' support enabled, several SPI based display binding examples have warnings: Documentation/devicetree/bindings/display/panel/samsung,ld9040.example.dt.yaml: lcd@0: Unevaluated properties are not allowed ('#address-cells', '#size-cells', 'spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-3wire' were unexpected) Documentation/devicetree/bindings/display/panel/ilitek,ili9322.example.dt.yaml: display@0: Unevaluated properties are not allowed ('reg' was unexpected) Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.example.dt.yaml: display@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/abt,y030xx067a.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/sony,acx565akm.example.dt.yaml: panel@2: Unevaluated properties are not allowed ('spi-max-frequency', 'reg' were unexpected) Documentation/devicetree/bindings/display/panel/tpo,td.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('reg', 'spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/innolux,ej030na.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/sitronix,st7789v.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Fix all of these by adding a reference to spi-peripheral-props.yaml. With this, the description that the binding must follow spi-controller.yaml is both a bit out of date and redundant, so remove it. Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Acked-by: Paul Cercueil Acked-by: Sam Ravnborg Link: https://lore.kernel.org/r/20211221125209.1195932-1-robh@kernel.org --- .../devicetree/bindings/display/panel/abt,y030xx067a.yaml | 5 +---- .../devicetree/bindings/display/panel/ilitek,ili9322.yaml | 4 +--- .../devicetree/bindings/display/panel/innolux,ej030na.yaml | 5 +---- .../bindings/display/panel/kingdisplay,kd035g6-54nt.yaml | 5 +---- .../bindings/display/panel/lgphilips,lb035q02.yaml | 5 +---- .../devicetree/bindings/display/panel/samsung,ld9040.yaml | 7 +------ .../devicetree/bindings/display/panel/samsung,s6e63m0.yaml | 1 + .../bindings/display/panel/sitronix,st7789v.yaml | 5 +---- .../devicetree/bindings/display/panel/sony,acx565akm.yaml | 5 +---- .../devicetree/bindings/display/panel/tpo,td.yaml | 5 +---- 10 files changed, 10 insertions(+), 37 deletions(-) diff --git a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml index a108029ecfab..acd2f3faa6b9 100644 --- a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml +++ b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Asia Better Technology 3.0" (320x480 pixels) 24-bit IPS LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml index e89c1ea62ffa..7d221ef35443 100644 --- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml +++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml @@ -15,11 +15,9 @@ description: | 960 TFT source driver pins and 240 TFT gate driver pins, VCOM, VCOML and VCOMH outputs. - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml index cda36c04e85c..72788e3e6c59 100644 --- a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml +++ b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Innolux EJ030NA 3.0" (320x480 pixels) 24-bit TFT LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml index c45c92a3d41f..2a2756d19681 100644 --- a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml +++ b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: King Display KD035G6-54NT 3.5" (320x240 pixels) 24-bit TFT LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml index 830e335ddb53..5e4e0e552c2f 100644 --- a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml +++ b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: LG.Philips LB035Q02 Panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Tomi Valkeinen allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml index 060ee27a4749..d525165d6d63 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Samsung LD9040 AMOLED LCD parallel RGB panel with SPI control bus -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Andrzej Hajda allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: @@ -63,8 +60,6 @@ examples: lcd@0 { compatible = "samsung,ld9040"; - #address-cells = <1>; - #size-cells = <0>; reg = <0>; vdd3-supply = <&ldo7_reg>; diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml index ea58df49263a..940f7f88526f 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml @@ -12,6 +12,7 @@ maintainers: allOf: - $ref: panel-common.yaml# - $ref: /schemas/leds/backlight/common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml index fa46d151e7b3..9e1d707c2ace 100644 --- a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml +++ b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sitronix ST7789V RGB panel with SPI control bus -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Maxime Ripard allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml index 95d053c548ab..98abdf4ddeac 100644 --- a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml +++ b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sony ACX565AKM SDI Panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Tomi Valkeinen allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml index 4aa605613445..f902a9d74141 100644 --- a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml +++ b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml @@ -6,16 +6,13 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Toppoly TD Panels -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Marek Belisko - H. Nikolaus Schaller allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: From 59449e5dc87e72e9b4a16df115625a93b0112203 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 6 Jan 2022 12:25:14 -0600 Subject: [PATCH 0178/1295] dt-bindings: mmc: arm,pl18x: Make each example a separate entry Each independent example should be a separate entry. This and dropping 'interrupt-parent' allows for 'interrupts' to have different cell sizes. Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220106182518.1435497-6-robh@kernel.org --- Documentation/devicetree/bindings/mmc/arm,pl18x.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml b/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml index 47595cb483be..2a64cffbe6ad 100644 --- a/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml +++ b/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml @@ -167,6 +167,9 @@ examples: clock-names = "mclk", "apb_pclk"; }; + - | + #include + mmc@80126000 { compatible = "arm,pl18x", "arm,primecell"; reg = <0x80126000 0x1000>; @@ -188,12 +191,12 @@ examples: vqmmc-supply = <&vmmci>; }; + - | mmc@101f6000 { compatible = "arm,pl18x", "arm,primecell"; reg = <0x101f6000 0x1000>; clocks = <&sdiclk>, <&pclksdi>; clock-names = "mclk", "apb_pclk"; - interrupt-parent = <&vica>; interrupts = <22>; max-frequency = <400000>; bus-width = <4>; @@ -208,6 +211,7 @@ examples: vmmc-supply = <&vmmc_regulator>; }; + - | mmc@52007000 { compatible = "arm,pl18x", "arm,primecell"; arm,primecell-periphid = <0x10153180>; From d9dfab097d90f74dd8d7198aa6e8b87bc15f2122 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 6 Jan 2022 12:25:15 -0600 Subject: [PATCH 0179/1295] dt-bindings: rtc: st,stm32-rtc: Make each example a separate entry Each independent example should be a separate entry. This allows for 'interrupts' to have different cell sizes. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220106182518.1435497-7-robh@kernel.org --- Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml index 2359f541b770..764717ce1873 100644 --- a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml @@ -127,6 +127,7 @@ examples: st,syscfg = <&pwrcfg 0x00 0x100>; }; + - | #include #include rtc@5c004000 { From 4c907bcd9dcd233da6707059d777ab389dcbd964 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 Jan 2022 15:31:01 +0300 Subject: [PATCH 0180/1295] ASoC: max9759: fix underflow in speaker_gain_control_put() Check for negative values of "priv->gain" to prevent an out of bounds access. The concern is that these might come from the user via: -> snd_ctl_elem_write_user() -> snd_ctl_elem_write() -> kctl->put() Fixes: fa8d915172b8 ("ASoC: max9759: Add Amplifier Driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220119123101.GA9509@kili Signed-off-by: Mark Brown --- sound/soc/codecs/max9759.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/max9759.c b/sound/soc/codecs/max9759.c index d75fd61b9032..bc57d7687f16 100644 --- a/sound/soc/codecs/max9759.c +++ b/sound/soc/codecs/max9759.c @@ -64,7 +64,8 @@ static int speaker_gain_control_put(struct snd_kcontrol *kcontrol, struct snd_soc_component *c = snd_soc_kcontrol_component(kcontrol); struct max9759 *priv = snd_soc_component_get_drvdata(c); - if (ucontrol->value.integer.value[0] > 3) + if (ucontrol->value.integer.value[0] < 0 || + ucontrol->value.integer.value[0] > 3) return -EINVAL; priv->gain = ucontrol->value.integer.value[0]; From 579b2c8f72d974f27d85bbd53846f34675ee3b01 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 00:03:24 -0500 Subject: [PATCH 0181/1295] ASoC: mediatek: fix unmet dependency on GPIOLIB for SND_SOC_DMIC When SND_SOC_MT8195_MT6359_RT1011_RT5682 is selected, and GPIOLIB is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for SND_SOC_DMIC Depends on [n]: SOUND [=y] && !UML && SND [=y] && SND_SOC [=y] && GPIOLIB [=n] Selected by [y]: - SND_SOC_MT8195_MT6359_RT1011_RT5682 [=y] && SOUND [=y] && !UML && SND [=y] && SND_SOC [=y] && I2C [=y] && SND_SOC_MT8195 [=y] && MTK_PMIC_WRAP [=y] This is because SND_SOC_MT8195_MT6359_RT1011_RT5682 selects SND_SOC_DMIC without selecting or depending on GPIOLIB, depsite SND_SOC_DMIC depending on GPIOLIB. This unmet dependency bug was detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20220117050324.68371-1-julianbraha@gmail.com Signed-off-by: Mark Brown --- sound/soc/mediatek/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/mediatek/Kconfig b/sound/soc/mediatek/Kconfig index 9306b7ca2644..0d154350f180 100644 --- a/sound/soc/mediatek/Kconfig +++ b/sound/soc/mediatek/Kconfig @@ -216,7 +216,7 @@ config SND_SOC_MT8195_MT6359_RT1019_RT5682 config SND_SOC_MT8195_MT6359_RT1011_RT5682 tristate "ASoC Audio driver for MT8195 with MT6359 RT1011 RT5682 codec" - depends on I2C + depends on I2C && GPIOLIB depends on SND_SOC_MT8195 && MTK_PMIC_WRAP select SND_SOC_MT6359 select SND_SOC_RT1011 From b4c18c18ebf7cf1e602af88c12ef9cb0d6e5ce51 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 19:36:03 -0800 Subject: [PATCH 0182/1295] regulator: MAX20086: add gpio/consumer.h max20086-regulator.c needs for an enum, some macros, and a function prototype. (seen on ARCH=m68k) Adding this header file fixes multiple build errors: ../drivers/regulator/max20086-regulator.c: In function 'max20086_i2c_probe': ../drivers/regulator/max20086-regulator.c:217:26: error: storage size of 'flags' isn't known 217 | enum gpiod_flags flags; ../drivers/regulator/max20086-regulator.c:261:27: error: 'GPIOD_OUT_HIGH' undeclared (first use in this function); did you mean 'GPIOF_INIT_HIGH'? 261 | flags = boot_on ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; | ^~~~~~~~~~~~~~ ../drivers/regulator/max20086-regulator.c:261:44: error: 'GPIOD_OUT_LOW' undeclared (first use in this function); did you mean 'GPIOF_INIT_LOW'? 261 | flags = boot_on ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; ../drivers/regulator/max20086-regulator.c:262:27: error: implicit declaration of function 'devm_gpiod_get'; did you mean 'devm_gpio_free'? [-Werror=implicit-function-declaration] 262 | chip->ena_gpiod = devm_gpiod_get(chip->dev, "enable", flags); ../drivers/regulator/max20086-regulator.c:217:26: warning: unused variable 'flags' [-Wunused-variable] 217 | enum gpiod_flags flags; Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Watson Chow Cc: Mark Brown Cc: Laurent Pinchart Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20220115033603.24473-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index 63aa6ec3254a..b8bf76c170fe 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include From e4d63473d3110afd170e6e0e48494d3789d26136 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Mon, 17 Jan 2022 13:17:44 +0100 Subject: [PATCH 0183/1295] spi: stm32-qspi: Update spi registering Some device driver need to communicate to qspi device during the remove process, qspi controller must be functional when spi_unregister_master() is called. To ensure this, replace devm_spi_register_master() by spi_register_master() and spi_unregister_master() is called directly in .remove callback before stopping the qspi controller. This issue was put in evidence using kernel v5.11 and later with a spi-nor which supports the software reset feature introduced by commit d73ee7534cc5 ("mtd: spi-nor: core: perform a Soft Reset on shutdown") Fixes: c530cd1d9d5e ("spi: spi-mem: add stm32 qspi controller") Signed-off-by: Patrice Chotard Cc: # 5.8.x Reviewed-by: Lukas Wunner Link: https://lore.kernel.org/r/20220117121744.29729-1-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 47 +++++++++++++----------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index 514337c86d2c..ffdc55f87e82 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -688,7 +688,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) struct resource *res; int ret, irq; - ctrl = spi_alloc_master(dev, sizeof(*qspi)); + ctrl = devm_spi_alloc_master(dev, sizeof(*qspi)); if (!ctrl) return -ENOMEM; @@ -697,58 +697,46 @@ static int stm32_qspi_probe(struct platform_device *pdev) res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi"); qspi->io_base = devm_ioremap_resource(dev, res); - if (IS_ERR(qspi->io_base)) { - ret = PTR_ERR(qspi->io_base); - goto err_master_put; - } + if (IS_ERR(qspi->io_base)) + return PTR_ERR(qspi->io_base); qspi->phys_base = res->start; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_mm"); qspi->mm_base = devm_ioremap_resource(dev, res); - if (IS_ERR(qspi->mm_base)) { - ret = PTR_ERR(qspi->mm_base); - goto err_master_put; - } + if (IS_ERR(qspi->mm_base)) + return PTR_ERR(qspi->mm_base); qspi->mm_size = resource_size(res); - if (qspi->mm_size > STM32_QSPI_MAX_MMAP_SZ) { - ret = -EINVAL; - goto err_master_put; - } + if (qspi->mm_size > STM32_QSPI_MAX_MMAP_SZ) + return -EINVAL; irq = platform_get_irq(pdev, 0); - if (irq < 0) { - ret = irq; - goto err_master_put; - } + if (irq < 0) + return irq; ret = devm_request_irq(dev, irq, stm32_qspi_irq, 0, dev_name(dev), qspi); if (ret) { dev_err(dev, "failed to request irq\n"); - goto err_master_put; + return ret; } init_completion(&qspi->data_completion); init_completion(&qspi->match_completion); qspi->clk = devm_clk_get(dev, NULL); - if (IS_ERR(qspi->clk)) { - ret = PTR_ERR(qspi->clk); - goto err_master_put; - } + if (IS_ERR(qspi->clk)) + return PTR_ERR(qspi->clk); qspi->clk_rate = clk_get_rate(qspi->clk); - if (!qspi->clk_rate) { - ret = -EINVAL; - goto err_master_put; - } + if (!qspi->clk_rate) + return -EINVAL; ret = clk_prepare_enable(qspi->clk); if (ret) { dev_err(dev, "can not enable the clock\n"); - goto err_master_put; + return ret; } rstc = devm_reset_control_get_exclusive(dev, NULL); @@ -784,7 +772,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) pm_runtime_enable(dev); pm_runtime_get_noresume(dev); - ret = devm_spi_register_master(dev, ctrl); + ret = spi_register_master(ctrl); if (ret) goto err_pm_runtime_free; @@ -806,8 +794,6 @@ err_dma_free: stm32_qspi_dma_free(qspi); err_clk_disable: clk_disable_unprepare(qspi->clk); -err_master_put: - spi_master_put(qspi->ctrl); return ret; } @@ -817,6 +803,7 @@ static int stm32_qspi_remove(struct platform_device *pdev) struct stm32_qspi *qspi = platform_get_drvdata(pdev); pm_runtime_get_sync(qspi->dev); + spi_unregister_master(qspi->ctrl); /* disable qspi */ writel_relaxed(0, qspi->io_base + QSPI_CR); stm32_qspi_dma_free(qspi); From 3cefddb72f80dc8d49ce605628ceb6525cfd64da Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Wed, 19 Jan 2022 10:32:44 +0100 Subject: [PATCH 0184/1295] spi: stm32: remove inexistant variables in struct stm32_spi_cfg comment Variables 'can_dma' and 'has_startbit' are described within the struct stm32_spi_cfg comment but have never existed in this structure so remove them. Signed-off-by: Alain Volmat Link: https://lore.kernel.org/r/20220119093245.624878-2-alain.volmat@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 9bd3fd1652f7..b5ef2470cefe 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -221,7 +221,6 @@ struct stm32_spi; * time between frames (if driver has this functionality) * @set_number_of_data: optional routine to configure registers to desired * number of data (if driver has this functionality) - * @can_dma: routine to determine if the transfer is eligible for DMA use * @transfer_one_dma_start: routine to start transfer a single spi_transfer * using DMA * @dma_rx_cb: routine to call after DMA RX channel operation is complete @@ -232,7 +231,6 @@ struct stm32_spi; * @baud_rate_div_min: minimum baud rate divisor * @baud_rate_div_max: maximum baud rate divisor * @has_fifo: boolean to know if fifo is used for driver - * @has_startbit: boolean to know if start bit is used to start transfer */ struct stm32_spi_cfg { const struct stm32_spi_regspec *regs; From 9df15d842a0f77f2b8ee29386f6d714e4220df57 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Wed, 19 Jan 2022 10:32:45 +0100 Subject: [PATCH 0185/1295] spi: stm32: make SPI_MASTER_MUST_TX flags only specific to STM32F4 Commit 61367d0b8f5e ("spi: stm32: Add 'SPI_SIMPLEX_RX', 'SPI_3WIRE_RX' support for stm32f4") allowed to properly communicate with the st-gyro-spi even when there is no tx_buf provided by setting the flag SPI_MASTER_MUST_TX and thus forcing a dummy TX buffer to work in Full Duplex. This behavior should kept only for the STM32F4 and not for other compatible since the STM32H7 do support SIMPLEX_RX and SIMPLEX_TX. Add the flags variable within the struct stm32_spi_cfg so that flags used at master registration time are compatible specific. Fixes: 61367d0b8f5e ("spi: stm32: Add 'SPI_SIMPLEX_RX', 'SPI_3WIRE_RX' support for stm32f4") Signed-off-by: Alain Volmat Link: https://lore.kernel.org/r/20220119093245.624878-3-alain.volmat@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index b5ef2470cefe..7fc24505a72c 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -231,6 +231,7 @@ struct stm32_spi; * @baud_rate_div_min: minimum baud rate divisor * @baud_rate_div_max: maximum baud rate divisor * @has_fifo: boolean to know if fifo is used for driver + * @flags: compatible specific SPI controller flags used at registration time */ struct stm32_spi_cfg { const struct stm32_spi_regspec *regs; @@ -251,6 +252,7 @@ struct stm32_spi_cfg { unsigned int baud_rate_div_min; unsigned int baud_rate_div_max; bool has_fifo; + u16 flags; }; /** @@ -1720,6 +1722,7 @@ static const struct stm32_spi_cfg stm32f4_spi_cfg = { .baud_rate_div_min = STM32F4_SPI_BR_DIV_MIN, .baud_rate_div_max = STM32F4_SPI_BR_DIV_MAX, .has_fifo = false, + .flags = SPI_MASTER_MUST_TX, }; static const struct stm32_spi_cfg stm32h7_spi_cfg = { @@ -1852,7 +1855,7 @@ static int stm32_spi_probe(struct platform_device *pdev) master->prepare_message = stm32_spi_prepare_msg; master->transfer_one = stm32_spi_transfer_one; master->unprepare_message = stm32_spi_unprepare_msg; - master->flags = SPI_MASTER_MUST_TX; + master->flags = spi->cfg->flags; spi->dma_tx = dma_request_chan(spi->dev, "tx"); if (IS_ERR(spi->dma_tx)) { From 9d5f0c36438eeae7566ca383b2b673179e3cc613 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 18 Jan 2022 09:02:43 -0300 Subject: [PATCH 0186/1295] perf machine: Use path__join() to compose a path instead of snprintf(dir, '/', filename) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its more intention revealing, and if we're interested in the odd cases where this may end up truncating we can do debug checks at one centralized place. Motivation, of all the container builds, fedora rawhide started complaining of: util/machine.c: In function ‘machine__create_modules’: util/machine.c:1419:50: error: ‘%s’ directive output may be truncated writing up to 255 bytes into a region of size between 0 and 4095 [-Werror=format-truncation=] 1419 | snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); | ^~ In file included from /usr/include/stdio.h:894, from util/branch.h:9, from util/callchain.h:8, from util/machine.c:7: In function ‘snprintf’, inlined from ‘maps__set_modules_path_dir’ at util/machine.c:1419:3, inlined from ‘machine__set_modules_path’ at util/machine.c:1473:9, inlined from ‘machine__create_modules’ at util/machine.c:1519:7: /usr/include/bits/stdio2.h:71:10: note: ‘__builtin___snprintf_chk’ output between 2 and 4352 bytes into a destination of size 4096 There are other places where we should use path__join(), but lets get rid of this one first. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Acked-by: Ian Rogers Link: Link: https://lore.kernel.org/r/YebZKjwgfdOz0lAs@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 3901440aeff9..f70ba56912d4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -16,6 +16,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "path.h" #include "srcline.h" #include "symbol.h" #include "sort.h" @@ -1416,7 +1417,7 @@ static int maps__set_modules_path_dir(struct maps *maps, const char *dir_name, i struct stat st; /*sshfs might return bad dent->d_type, so we have to stat*/ - snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); + path__join(path, sizeof(path), dir_name, dent->d_name); if (stat(path, &st)) continue; From 7c8a4742c4abe205ec9daf416c9d42fd6b406e8e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:17 +0000 Subject: [PATCH 0187/1295] KVM: x86/mmu: Fix write-protection of PTs mapped by the TDP MMU When the TDP MMU is write-protection GFNs for page table protection (as opposed to for dirty logging, or due to the HVA not being writable), it checks if the SPTE is already write-protected and if so skips modifying the SPTE and the TLB flush. This behavior is incorrect because it fails to check if the SPTE is write-protected for page table protection, i.e. fails to check that MMU-writable is '0'. If the SPTE was write-protected for dirty logging but not page table protection, the SPTE could locklessly be made writable, and vCPUs could still be running with writable mappings cached in their TLB. Fix this by only skipping setting the SPTE if the SPTE is already write-protected *and* MMU-writable is already clear. Technically, checking only MMU-writable would suffice; a SPTE cannot be writable without MMU-writable being set. But check both to be paranoid and because it arguably yields more readable code. Fixes: 46044f72c382 ("kvm: x86/mmu: Support write protection for nesting in tdp MMU") Cc: stable@vger.kernel.org Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-2-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b1bc816b7c3..bc9e3553fba2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_writable_pte(iter.old_spte)) - break; - new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); + if (new_spte == iter.old_spte) + break; + tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; } From f082d86ea68559e4bd1ecaffa04981b72281e28f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:18 +0000 Subject: [PATCH 0188/1295] KVM: x86/mmu: Clear MMU-writable during changed_pte notifier When handling the changed_pte notifier and the new PTE is read-only, clear both the Host-writable and MMU-writable bits in the SPTE. This preserves the invariant that MMU-writable is set if-and-only-if Host-writable is set. No functional change intended. Nothing currently relies on the aforementioned invariant and technically the changed_pte notifier is dead code. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-3-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8a7b03207762..f8677404c93c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -215,6 +215,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~shadow_host_writable_mask; + new_spte &= ~shadow_mmu_writable_mask; new_spte = mark_spte_for_access_track(new_spte); From 5f16bcac6e280a7dade580d9627f5cf93ef6aa56 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:19 +0000 Subject: [PATCH 0189/1295] KVM: x86/mmu: Document and enforce MMU-writable and Host-writable invariants SPTEs are tagged with software-only bits to indicate if it is "MMU-writable" and "Host-writable". These bits are used to determine why KVM has marked an SPTE as read-only. Document these bits and their invariants, and enforce the invariants with new WARNs in spte_can_locklessly_be_made_writable() to ensure they are not accidentally violated in the future. Opportunistically move DEFAULT_SPTE_{MMU,HOST}_WRITABLE next to EPT_SPTE_{MMU,HOST}_WRITABLE since the new documentation applies to both. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.h | 42 +++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index a4af2a42695c..be6a007a4af3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) - /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the @@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); +/* + * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits + * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs + * that map guest pages in read-only memslots and read-only VMAs. + * + * Invariants: + * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. + * + * + * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU + * allows writes to the guest page mapped by the SPTE. This bit is cleared when + * the guest page mapped by the SPTE contains a page table that is being + * monitored for shadow paging. In this case the SPTE can only be made writable + * by unsyncing the shadow page under the mmu_lock. + * + * Invariants: + * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. + * - If MMU-writable is set, Host-writable must be set. + * + * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared + * to track writes for dirty logging. For such SPTEs, KVM will locklessly set + * PT_WRITABLE_MASK upon the next write from the guest and record the write in + * the dirty log (see fast_page_fault()). + */ + +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked @@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & shadow_host_writable_mask) && - (spte & shadow_mmu_writable_mask); + if (spte & shadow_mmu_writable_mask) { + WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); + return true; + } + + WARN_ON_ONCE(spte & PT_WRITABLE_MASK); + return false; } static inline u64 get_mmio_spte_generation(u64 spte) From 6ff94f27fd47847d6ecb9302f9d3bd1ca991a17f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:20 +0000 Subject: [PATCH 0190/1295] KVM: x86/mmu: Improve TLB flush comment in kvm_mmu_slot_remove_write_access() Rewrite the comment in kvm_mmu_slot_remove_write_access() that explains why it is safe to flush TLBs outside of the MMU lock after write-protecting SPTEs for dirty logging. The current comment is a long run-on sentence that was difficult to understand. In addition it was specific to the shadow MMU (mentioning mmu_spte_update()) when the TDP MMU has to handle this as well. The new comment explains: - Why the TLB flush is necessary at all. - Why it is desirable to do the TLB flush outside of the MMU lock. - Why it is safe to do the TLB flush outside of the MMU lock. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-5-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1d275e9d76b5..593093b52395 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } /* - * We can flush all the TLBs out of the mmu lock without TLB - * corruption since we just change the spte from writable to - * readonly so that we only need to care the case of changing - * spte from present to present (changing the spte from present - * to nonpresent will flush all the TLBs immediately), in other - * words, the only case we care is mmu_spte_update() where we - * have checked Host-writable | MMU-writable instead of - * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK - * anymore. + * Flush TLBs if any SPTEs had to be write-protected to ensure that + * guest writes are reflected in the dirty bitmap before the memslot + * update completes, i.e. before enabling dirty logging is visible to + * userspace. + * + * Perform the TLB flush outside the mmu_lock to reduce the amount of + * time the lock is held. However, this does mean that another CPU can + * now grab mmu_lock and encounter a write-protected SPTE while CPUs + * still have a writable mapping for the associated GFN in their TLB. + * + * This is safe but requires KVM to be careful when making decisions + * based on the write-protection status of an SPTE. Specifically, KVM + * also write-protects SPTEs to monitor changes to guest page tables + * during shadow paging, and must guarantee no CPUs can write to those + * page before the lock is dropped. As mentioned in the previous + * paragraph, a write-protected SPTE is no guarantee that CPU cannot + * perform writes. So to determine if a TLB flush is truly required, KVM + * will clear a separate software-only bit (MMU-writable) and skip the + * flush if-and-only-if this bit was already clear. + * + * See DEFAULT_SPTE_MMU_WRITEABLE for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); From e9737468829c2f6abc0c67e5372f8878dff11653 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 17 Jan 2022 15:45:31 +0800 Subject: [PATCH 0191/1295] KVM: x86/cpuid: Clear XFD for component i if the base feature is missing According to Intel extended feature disable (XFD) spec, the sub-function i (i > 1) of CPUID function 0DH enumerates "details for state component i. ECX[2] enumerates support for XFD support for this state component." If KVM does not report F(XFD) feature (e.g. due to CONFIG_X86_64), then the corresponding XFD support for any state component i should also be removed. Translate this dependency into KVM terms. Fixes: 690a757d610e ("kvm: x86: Add CPUID support for Intel AMX") Signed-off-by: Like Xu Message-Id: <20220117074531.76925-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a7c280d2113b..3902c28fb6cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -936,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) --array->nent; continue; } + + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + entry->ecx &= ~BIT_ULL(2); entry->edx = 0; } break; From 1a1d1dbce6d5477e2bb08ce1ef0d77caa838cc8e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 17 Jan 2022 20:48:17 -0500 Subject: [PATCH 0192/1295] kvm: selftests: conditionally build vm_xsave_req_perm() vm_xsave_req_perm() is currently defined and used by x86_64 only. Make it compiled into vm_create_with_vcpus() only when on x86_64 machines. Otherwise, it would cause linkage errors, e.g. on s390x. Fixes: 415a3c33e8 ("kvm: selftests: Add support for KVM_CAP_XSAVE2") Reported-by: Janis Schoetterl-Glausch Signed-off-by: Wei Wang Tested-by: Janis Schoetterl-Glausch Message-Id: <20220118014817.30910-1-wei.w.wang@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 4a645dc77f34..c22a17aac6b0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, struct kvm_vm *vm; int i; +#ifdef __x86_64__ /* * Permission needs to be requested before KVM_SET_CPUID2. */ vm_xsave_req_perm(); +#endif /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) From 3663c9045f51a7ad635a0785adef07c21b79b560 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sat, 30 Oct 2021 04:51:35 +0000 Subject: [PATCH 0193/1295] cifs: check reconnects for channels of active tcons too With the new multichannel logic, when a channel needs reconnection, the tree connect and other channels can still be active. This fix will handle cases of checking for channel reconnect, when the tcon does not need reconnect. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 34 +++++++++++++++++-- fs/cifs/smb2pdu.c | 82 +++++++++++++++++++++++++++++++++++++--------- 3 files changed, 99 insertions(+), 19 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f84978b76bb6..cace0818d510 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -117,6 +117,7 @@ enum statusEnum { CifsInSessSetup, CifsNeedTcon, CifsInTcon, + CifsNeedFilesInvalidate, CifsInFilesInvalidate }; @@ -923,6 +924,7 @@ struct cifs_chan { */ struct cifs_ses { struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ struct list_head tcon_list; struct cifs_tcon *tcon_ipc; struct mutex session_mutex; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index accce1b351c6..804ffcde0391 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -335,6 +335,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); @@ -4404,9 +4405,22 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru char *tree; struct dfs_info3_param ref = {0}; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); - if (!tree) - return -ENOMEM; + if (!tree) { + rc = -ENOMEM; + goto out; + } if (tcon->ipc) { scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); @@ -4438,11 +4452,18 @@ out: kfree(tree); cifs_put_tcp_super(sb); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } + return rc; } #else int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc) { + int rc; const struct smb_version_operations *ops = tcon->ses->server->ops; /* only send once per connect */ @@ -4456,6 +4477,13 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->tidStatus = CifsInTcon; spin_unlock(&cifs_tcp_ses_lock); - return ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } + + return rc; } #endif diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 625e3e9cb614..41b6dffc84c7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -289,14 +289,18 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, rc = -EHOSTDOWN; goto failed; } - } - - if (rc || !tcon->need_reconnect) { + } else { mutex_unlock(&ses->session_mutex); goto out; } + mutex_unlock(&ses->session_mutex); skip_sess_setup: + mutex_lock(&ses->session_mutex); + if (!tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; + } cifs_mark_open_files_invalid(tcon); if (tcon->use_persistent) tcon->need_reopen_files = true; @@ -3787,27 +3791,35 @@ void smb2_reconnect_server(struct work_struct *work) { struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, reconnect.work); - struct cifs_ses *ses; + struct TCP_Server_Info *pserver; + struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; - struct list_head tmp_list; - int tcon_exist = false; + struct list_head tmp_list, tmp_ses_list; + bool tcon_exist = false, ses_exist = false; + bool tcon_selected = false, ses_selected = false; int rc; - int resched = false; + bool resched = false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ - mutex_lock(&server->reconnect_mutex); + mutex_lock(&pserver->reconnect_mutex); INIT_LIST_HEAD(&tmp_list); - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + INIT_LIST_HEAD(&tmp_ses_list); + cifs_dbg(FYI, "Reconnecting tcons and channels\n"); spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + + tcon_selected = ses_selected = false; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; } } /* @@ -3816,7 +3828,17 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; + ses->ses_count++; + } + /* + * handle the case where channel needs to reconnect + * binding session, but tcon is healthy (some other channel + * is active) + */ + if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { + list_add_tail(&ses->rlist, &tmp_ses_list); + ses_selected = ses_exist = true; ses->ses_count++; } } @@ -3824,7 +3846,7 @@ void smb2_reconnect_server(struct work_struct *work) * Get the reference to server struct to be sure that the last call of * cifs_put_tcon() in the loop below won't release the server pointer. */ - if (tcon_exist) + if (tcon_exist || ses_exist) server->srv_count++; spin_unlock(&cifs_tcp_ses_lock); @@ -3842,13 +3864,41 @@ void smb2_reconnect_server(struct work_struct *work) cifs_put_tcon(tcon); } - cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (!ses_exist) + goto done; + + /* allocate a dummy tcon struct used for reconnect */ + tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (!tcon) { + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + goto done; + } + + tcon->tidStatus = CifsGood; + tcon->retry = false; + tcon->need_reconnect = false; + + /* now reconnect sessions for necessary channels */ + list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { + tcon->ses = ses; + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); + if (rc) + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + } + kfree(tcon); + +done: + cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); if (resched) queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); - mutex_unlock(&server->reconnect_mutex); + mutex_unlock(&pserver->reconnect_mutex); /* now we can safely release srv struct */ - if (tcon_exist) + if (tcon_exist || ses_exist) cifs_put_tcp_session(server, 1); } From a05885ce13bd5ec9602551e32dfb1a4f26bfa542 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 17 Nov 2021 15:57:22 +0000 Subject: [PATCH 0194/1295] cifs: fix the connection state transitions with multichannel Recent changes to multichannel required some adjustments in the way connection states transitioned during/after reconnect. Also some minor fixes: 1. A pending switch of GlobalMid_Lock to cifs_tcp_ses_lock 2. Relocations of the code that logs reconnect 3. Changed some code in allocate_mid to suit the new scheme Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 40 ++++++++++++++++++++++++++-------------- fs/cifs/sess.c | 1 - fs/cifs/smb2pdu.c | 1 - fs/cifs/transport.c | 14 ++------------ 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 804ffcde0391..9fba2ec56328 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -181,17 +181,16 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, server->maxBuf = 0; server->max_read = 0; - cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* * before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they * are not used until reconnected. */ - cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", __func__); + cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__); /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { spin_lock(&ses->chan_lock); @@ -218,13 +217,8 @@ next_session: } spin_unlock(&cifs_tcp_ses_lock); - /* - * before reconnecting the tcp session, mark the smb session (uid) - * and the tid bad so they are not used until reconnected - */ - cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect and tearing down socket\n", - __func__); /* do not want to be sending data on a socket we are freeing */ + cifs_dbg(FYI, "%s: tearing down socket\n", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state, @@ -280,7 +274,12 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num wake_up(&server->response_q); return false; } + + cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, + server->hostname); server->tcpStatus = CifsNeedReconnect; + spin_unlock(&cifs_tcp_ses_lock); return true; } @@ -634,7 +633,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (server->tcpStatus == CifsNeedReconnect) { spin_unlock(&cifs_tcp_ses_lock); - cifs_reconnect(server, false); return -ECONNABORTED; } spin_unlock(&cifs_tcp_ses_lock); @@ -3885,6 +3883,11 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, else rc = -EHOSTDOWN; spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInNegotiate) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -3931,8 +3934,18 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, server, nls_info); - if (rc) + if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInSessSetup) + server->tcpStatus = CifsNeedSessSetup; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInSessSetup) + server->tcpStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + } return rc; } @@ -4279,9 +4292,8 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + if (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon) { spin_unlock(&cifs_tcp_ses_lock); return 0; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 50ca479d7039..e7fddd4a5990 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1054,7 +1054,6 @@ sess_establish_session(struct sess_data *sess_data) /* Even if one channel is active, session is in good state */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsGood; ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 41b6dffc84c7..6d8cfe15fbaf 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1393,7 +1393,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) /* Even if one channel is active, session is in good state */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsGood; ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 93f0e8c1ea23..548cb376942e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -431,7 +431,8 @@ unmask: * socket so the server throws away the partial SMB */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsNeedReconnect; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedReconnect; spin_unlock(&cifs_tcp_ses_lock); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); @@ -729,17 +730,6 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { spin_lock(&cifs_tcp_ses_lock); - if (ses->server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); - return -ENOENT; - } - - if (ses->server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); - cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); - return -EAGAIN; - } - if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { From 88b024f556fcd5bf1288c6333016f576cfa5f539 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 19 Nov 2021 14:16:57 +0000 Subject: [PATCH 0195/1295] cifs: protect all accesses to chan_* with chan_lock A spin lock called chan_lock was introduced recently. But not all accesses were protected. Doing that with this change. To make sure that a channel is not freed when in use, we need to introduce a ref count. But today, we don't ever free channels. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 4 +++- fs/cifs/sess.c | 10 +++++++--- fs/cifs/smb2pdu.c | 4 +++- fs/cifs/smb2transport.c | 6 ++++++ fs/cifs/transport.c | 3 +++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9fba2ec56328..d643f9e16f4e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1831,7 +1831,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_lock(&ses->chan_lock); chan_count = ses->chan_count; - spin_unlock(&ses->chan_lock); /* close any extra channels */ if (chan_count > 1) { @@ -1848,6 +1847,7 @@ void cifs_put_smb_ses(struct cifs_ses *ses) ses->chans[i].server = NULL; } } + spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -2123,8 +2123,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) mutex_unlock(&ses->session_mutex); /* each channel uses a different signing key */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, sizeof(ses->smb3signingkey)); + spin_unlock(&ses->chan_lock); if (rc) goto get_ses_fail; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index e7fddd4a5990..f7de57f6f047 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -65,6 +65,8 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) return false; } +/* channel helper functions. assumed that chan_lock is held by caller. */ + unsigned int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server) @@ -134,10 +136,10 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) left = ses->chan_max - ses->chan_count; if (left <= 0) { + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "ses already at max_channels (%zu), nothing to open\n", ses->chan_max); - spin_unlock(&ses->chan_lock); return 0; } @@ -369,12 +371,14 @@ void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) { int i; + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - spin_lock(&cifs_tcp_ses_lock); if (ses->chans[i].server->tcpStatus != CifsExiting) ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); } + spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); } static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6d8cfe15fbaf..ca8a16896f17 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -244,10 +244,10 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, spin_unlock(&ses->chan_lock); return 0; } + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", tcon->ses->chans_need_reconnect, tcon->need_reconnect); - spin_unlock(&ses->chan_lock); nls_codepage = load_nls_default(); @@ -3835,11 +3835,13 @@ void smb2_reconnect_server(struct work_struct *work) * binding session, but tcon is healthy (some other channel * is active) */ + spin_lock(&ses->chan_lock); if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { list_add_tail(&ses->rlist, &tmp_ses_list); ses_selected = ses_exist = true; ses->ses_count++; } + spin_unlock(&ses->chan_lock); } /* * Get the reference to server struct to be sure that the last call of diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index b70a49b4edc0..2af79093b78b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -100,6 +100,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) goto out; found: + spin_lock(&ses->chan_lock); if (cifs_chan_needs_reconnect(ses, server) && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { /* @@ -108,6 +109,7 @@ found: * session key */ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } @@ -119,9 +121,11 @@ found: chan = ses->chans + i; if (chan->server == server) { memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } } + spin_unlock(&ses->chan_lock); cifs_dbg(VFS, "%s: Could not find channel signing key for session 0x%llx\n", @@ -430,8 +434,10 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; /* safe to access primary channel, since it will never go away */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); rc = generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 548cb376942e..8540f7c13eae 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1049,7 +1049,10 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) /* round robin */ index = (uint)atomic_inc_return(&ses->chan_seq); + + spin_lock(&ses->chan_lock); index %= ses->chan_count; + spin_unlock(&ses->chan_lock); return ses->chans[index].server; } From 8a409cda978e212661b8c032e1b08b3b0b0f9d36 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 5 Jan 2022 02:24:37 +0500 Subject: [PATCH 0196/1295] cifs: remove unused variable ses_selected ses_selected is being declared and set at several places. It is not being used. Remove it. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ca8a16896f17..202ddbafe939 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3795,7 +3795,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; bool tcon_exist = false, ses_exist = false; - bool tcon_selected = false, ses_selected = false; + bool tcon_selected = false; int rc; bool resched = false; @@ -3812,7 +3812,7 @@ void smb2_reconnect_server(struct work_struct *work) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - tcon_selected = ses_selected = false; + tcon_selected = false; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->need_reconnect || tcon->need_reopen_files) { @@ -3838,7 +3838,7 @@ void smb2_reconnect_server(struct work_struct *work) spin_lock(&ses->chan_lock); if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { list_add_tail(&ses->rlist, &tmp_ses_list); - ses_selected = ses_exist = true; + ses_exist = true; ses->ses_count++; } spin_unlock(&ses->chan_lock); From e154cb7b0ab961f9d785ed34c2d7128413e7083d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 11:19:36 +0000 Subject: [PATCH 0197/1295] cifs: fix the cifs_reconnect path for DFS Recently, the cifs_reconnect code was refactored into two branches for regular vs dfs codepath. Some of my recent changes were missing in the dfs path, namely the code to enable periodic DNS query, and a missing lock. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d643f9e16f4e..4a31a9b3f34f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -338,8 +338,10 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, } } while (server->tcpStatus == CifsNeedReconnect); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + spin_unlock(&cifs_tcp_ses_lock); wake_up(&server->response_q); return rc; @@ -454,6 +456,7 @@ reconnect_dfs_server(struct TCP_Server_Info *server, spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); if (target_hint) From ece0767641740c7eea7aee5a332728e115b00eab Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 13:28:34 +0000 Subject: [PATCH 0198/1295] cifs: remove repeated state change in dfs tree connect cifs_tree_connect checks and sets the tidStatus for the tcon. cifs_tree_connect also calls a dfs specific tree connect function, which also does similar checks. This should not happen. Removing it with this change. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4a31a9b3f34f..6740a7c39df3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4295,16 +4295,6 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct dfs_cache_tgt_iterator *tit; bool target_match; - /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon) { - spin_unlock(&cifs_tcp_ses_lock); - return 0; - } - tcon->tidStatus = CifsInTcon; - spin_unlock(&cifs_tcp_ses_lock); - extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); tit = dfs_cache_get_tgt_iterator(tl); From c1604da708d345a1ca1cf6a5537d503b14aa4787 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 13:38:14 +0000 Subject: [PATCH 0199/1295] cifs: make status checks in version independent callers The status of tcp session, smb session and tcon have the same flow, irrespective of the SMB version used. Hence these status checks and updates should happen in the version independent callers of these commands. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 29 +++++++++++++++++++++++------ fs/cifs/sess.c | 9 --------- fs/cifs/smb2pdu.c | 14 -------------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6740a7c39df3..0a35503b7b46 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3770,10 +3770,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { bool is_unicode; - spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); bytes_left = get_bcc(smb_buffer_response); @@ -3949,7 +3945,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsInSessSetup) server->tcpStatus = CifsGood; + /* Even if one channel is active, session is in good state */ + if (ses->status == CifsInSessSetup) + ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); } return rc; @@ -4461,8 +4464,15 @@ out: if (rc) { spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsNeedTcon; + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; } return rc; @@ -4487,8 +4497,15 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); if (rc) { spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsNeedTcon; + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; } return rc; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f7de57f6f047..97fba1f28e4b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1052,15 +1052,6 @@ sess_establish_session(struct sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&ses->chan_lock); - cifs_chan_clear_need_reconnect(ses, server); - spin_unlock(&ses->chan_lock); - - /* Even if one channel is active, session is in good state */ - spin_lock(&cifs_tcp_ses_lock); - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - return 0; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 202ddbafe939..1e670e56b07a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1386,16 +1386,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - - spin_lock(&ses->chan_lock); - cifs_chan_clear_need_reconnect(ses, server); - spin_unlock(&ses->chan_lock); - - /* Even if one channel is active, session is in good state */ - spin_lock(&cifs_tcp_ses_lock); - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - return rc; } @@ -1923,10 +1913,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->share_flags = le32_to_cpu(rsp->ShareFlags); tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); - spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - tcon->need_reconnect = false; tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); From 47de760655f329ce4b3d3e6276557220956d8c38 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 18 Jan 2022 09:24:08 +0000 Subject: [PATCH 0200/1295] cifs: update tcpStatus during negotiate and sess setup Till the end of SMB session setup, update tcpStatus and avoid updating session status field. There was a typo in cifs_setup_session, which caused ses->status to be updated instead. This was causing issues during reconnect. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0a35503b7b46..bcba3324cb4b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3908,7 +3908,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&cifs_tcp_ses_lock); return 0; } - ses->status = CifsInSessSetup; + server->tcpStatus = CifsInSessSetup; spin_unlock(&cifs_tcp_ses_lock); spin_lock(&ses->chan_lock); @@ -3946,8 +3946,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->tcpStatus == CifsInSessSetup) server->tcpStatus = CifsGood; /* Even if one channel is active, session is in good state */ - if (ses->status == CifsInSessSetup) - ses->status = CifsGood; + ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); spin_lock(&ses->chan_lock); From ba978e83255a759a4a07257a46ca6396a8b81787 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 17 Jan 2022 07:15:02 +0000 Subject: [PATCH 0201/1295] cifs: cifs_ses_mark_for_reconnect should also update reconnect bits Recent restructuring of cifs_reconnect introduced a helper func named cifs_ses_mark_for_reconnect, which updates the state of tcp session for all the channels of a session for reconnect. However, this does not update the session state and chans_need_reconnect bitmask. This change fixes that. Also, cifs_mark_tcp_sess_for_reconnect should mark set the bitmask for all channels when the whole session is marked for reconnect. Fixed that here too. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifs_swn.c | 9 +++------ fs/cifs/cifsproto.h | 3 +++ fs/cifs/connect.c | 9 ++++++--- fs/cifs/dfs_cache.c | 2 +- fs/cifs/netmisc.c | 5 +---- fs/cifs/sess.c | 15 --------------- 6 files changed, 14 insertions(+), 29 deletions(-) diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 8f386dd9939e..463ebe34892b 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -498,10 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->server->tcpStatus != CifsExiting) - tcon->ses->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(tcon->ses->server, false); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f2029bc46215..d3701295402d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -131,6 +131,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf , struct smb_hdr *out_buf, int *bytes_returned); +void +cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session); extern int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session); extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bcba3324cb4b..453aba3c2ad6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -168,7 +168,7 @@ static void cifs_resolve_server(struct work_struct *work) * @server needs to be previously set to CifsNeedReconnect. * */ -static void +void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { @@ -197,7 +197,10 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; - cifs_chan_set_need_reconnect(ses, server); + if (mark_smb_session) + CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); + else + cifs_chan_set_need_reconnect(ses, server); /* If all channels need reconnect, then tcon needs reconnect */ if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) @@ -4396,7 +4399,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_reconnect(tcon->ses->server, true); } dfs_cache_free_tgts(tl); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index e9b0fa2a9614..dd9643751671 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_reconnect(tcon->ses->server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 43b16b6d108c..ebe236b9d9f5 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,10 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - spin_lock(&cifs_tcp_ses_lock); - if (mid->server->tcpStatus != CifsExiting) - mid->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(mid->server, false); } } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 97fba1f28e4b..15373a377a36 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -366,21 +366,6 @@ out: return rc; } -/* Mark all session channels for reconnect */ -void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) -{ - int i; - - spin_lock(&cifs_tcp_ses_lock); - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - } - spin_unlock(&ses->chan_lock); - spin_unlock(&cifs_tcp_ses_lock); -} - static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, struct TCP_Server_Info *server, SESSION_SETUP_ANDX *pSMB) From 7ff775aca48adc854436b92c060e5eebfffb6a4a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:26 -0800 Subject: [PATCH 0202/1295] KVM: x86/pmu: Use binary search to check filtered events The PMU event filter may contain up to 300 events. Replace the linear search in reprogram_gp_counter() with a binary search. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e632693a2266..2c98f3ee8df4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include "x86.h" #include "cpuid.h" @@ -172,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static int cmp_u64(const void *a, const void *b) +{ + return *(__u64 *)a - *(__u64 *)b; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - int i; bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -192,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (filter) { - for (i = 0; i < filter->nevents; i++) - if (filter->events[i] == - (eventsel & AMD64_RAW_EVENT_MASK_NB)) - break; - if (filter->action == KVM_PMU_EVENT_ALLOW && - i == filter->nevents) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_DENY && - i < filter->nevents) - allow_event = false; + __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; } if (!allow_event) return; @@ -576,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) /* Ensure nevents can't be changed between the user copies. */ *filter = tmp; + /* + * Sort the in-kernel list so that we can search it with bsearch. + */ + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); From b33b9c407861985713ca18cc9ea05b7540210ad4 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:27 -0800 Subject: [PATCH 0203/1295] selftests: kvm/x86: Parameterize the CPUID vendor string check Refactor is_intel_cpu() to make it easier to reuse the bulk of the code for other vendors in the future. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-3-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index d61e2326dc85..308a1fe687f1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1253,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state) free(state); } -bool is_intel_cpu(void) +static bool cpu_vendor_string_is(const char *vendor) { + const uint32_t *chunk = (const uint32_t *)vendor; int eax, ebx, ecx, edx; - const uint32_t *chunk; const int leaf = 0; __asm__ __volatile__( @@ -1265,10 +1265,14 @@ bool is_intel_cpu(void) "=c"(ecx), "=d"(edx) : /* input */ "0"(leaf), "2"(0)); - chunk = (const uint32_t *)("GenuineIntel"); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); } +bool is_intel_cpu(void) +{ + return cpu_vendor_string_is("GenuineIntel"); +} + uint32_t kvm_get_cpuid_max_basic(void) { return kvm_get_supported_cpuid_entry(0)->eax; From 21066101f42cfd86fdd835b70ce0e36c335f5f4d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:28 -0800 Subject: [PATCH 0204/1295] selftests: kvm/x86: Introduce is_amd_cpu() Replace the one ad hoc "AuthenticAMD" CPUID vendor string comparison with a new function, is_amd_cpu(). Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-4-jmattson@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 1 + .../selftests/kvm/lib/x86_64/processor.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index bb013d101c14..e82c44aa029a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -364,6 +364,7 @@ static inline unsigned long get_xmm(int n) } bool is_intel_cpu(void); +bool is_amd_cpu(void); struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 308a1fe687f1..6bf01ded3776 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1273,6 +1273,14 @@ bool is_intel_cpu(void) return cpu_vendor_string_is("GenuineIntel"); } +/* + * Exclude early K5 samples with a vendor string of "AMDisbetter!" + */ +bool is_amd_cpu(void) +{ + return cpu_vendor_string_is("AuthenticAMD"); +} + uint32_t kvm_get_cpuid_max_basic(void) { return kvm_get_supported_cpuid_entry(0)->eax; @@ -1508,10 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui return cpuid; } -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 - static inline unsigned x86_family(unsigned int eax) { unsigned int x86; @@ -1533,11 +1537,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; /* Avoid reserved HyperTransport region on AMD processors. */ - eax = ecx = 0; - cpuid(&eax, &ebx, &ecx, &edx); - if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx || - ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx || - edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + if (!is_amd_cpu()) return max_gfn; /* On parts with <40 physical address bits, the area is fully hidden */ From 398f9240f90f4168f5882180723f743f7b682049 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:29 -0800 Subject: [PATCH 0205/1295] selftests: kvm/x86: Export x86_family() for use outside of processor.c Move this static inline function to processor.h, so that it can be used in individual tests, as needed. Opportunistically replace the bare 'unsigned' with 'unsigned int.' Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/x86_64/processor.h | 12 ++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e82c44aa029a..4eb73959ce05 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -366,6 +366,18 @@ static inline unsigned long get_xmm(int n) bool is_intel_cpu(void); bool is_amd_cpu(void); +static inline unsigned int x86_family(unsigned int eax) +{ + unsigned int x86; + + x86 = (eax >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; + + return x86; +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6bf01ded3776..59dcfe1967cc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1516,18 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui return cpuid; } -static inline unsigned x86_family(unsigned int eax) -{ - unsigned int x86; - - x86 = (eax >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; - - return x86; -} - unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ From 2ba9047424fc7243c63ac57f5fdfa754aa895e3c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:30 -0800 Subject: [PATCH 0206/1295] selftests: kvm/x86: Introduce x86_model() Extract the x86 model number from CPUID.01H:EAX. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-6-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 4eb73959ce05..122447827954 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -378,6 +378,11 @@ static inline unsigned int x86_family(unsigned int eax) return x86; } +static inline unsigned int x86_model(unsigned int eax) +{ + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); From bef9a701f3ebfb60da259b04778d24128505a96c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:31 -0800 Subject: [PATCH 0207/1295] selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER Verify that the PMU event filter works as expected. Note that the virtual PMU doesn't work as expected on AMD Zen CPUs (an intercepted rdmsr is counted as a retired branch instruction), but the PMU event filter does work. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-7-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/pmu_event_filter_test.c | 438 ++++++++++++++++++ 3 files changed, 440 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 20b4c921c9a2..9fe19f412f44 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -22,6 +22,7 @@ /x86_64/mmio_warning_test /x86_64/mmu_role_test /x86_64/platform_info_test +/x86_64/pmu_event_filter_test /x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/sev_migrate_tests diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ec78a8692899..7fbc80e3ecdf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c new file mode 100644 index 000000000000..aa104946e6e0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_SET_PMU_EVENT_FILTER. + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies the expected behavior of allow lists and deny lists for + * virtual PMU events. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* + * In lieu of copying perf_event.h into tools... + */ +#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_ebx { + struct { + unsigned int no_unhalted_core_cycles:1; + unsigned int no_instructions_retired:1; + unsigned int no_unhalted_reference_cycles:1; + unsigned int no_llc_reference:1; + unsigned int no_llc_misses:1; + unsigned int no_branch_instruction_retired:1; + unsigned int no_branch_misses_retired:1; + } split; + unsigned int full; +}; + +/* End of stuff taken from perf_event.h. */ + +/* Oddly, this isn't in perf_event.h. */ +#define ARCH_PERFMON_BRANCHES_RETIRED 5 + +#define VCPU_ID 0 +#define NUM_BRANCHES 42 + +/* + * This is how the event selector and unit mask are stored in an AMD + * core performance event-select register. Intel's format is similar, + * but the event selector is only 8 bits. + */ +#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \ + (umask & 0xff) << 8) + +/* + * "Branch instructions retired", from the Intel SDM, volume 3, + * "Pre-defined Architectural Performance Events." + */ + +#define INTEL_BR_RETIRED EVENT(0xc4, 0) + +/* + * "Retired branch instructions", from Processor Programming Reference + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, + * Preliminary Processor Programming Reference (PPR) for AMD Family + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision + * B1 Processors Volume 1 of 2. + */ + +#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0) + +/* + * This event list comprises Intel's eight architectural events plus + * AMD's "retired branch instructions" for Zen[123] (and possibly + * other AMD CPUs). + */ +static const uint64_t event_list[] = { + EVENT(0x3c, 0), + EVENT(0xc0, 0), + EVENT(0x3c, 1), + EVENT(0x2e, 0x4f), + EVENT(0x2e, 0x41), + EVENT(0xc4, 0), + EVENT(0xc5, 0), + EVENT(0xa4, 1), + AMD_ZEN_BR_RETIRED, +}; + +/* + * If we encounter a #GP during the guest PMU sanity check, then the guest + * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). + */ +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(0); +} + +/* + * Check that we can write a new value to the given MSR and read it back. + * The caller should provide a non-empty set of bits that are safe to flip. + * + * Return on success. GUEST_SYNC(0) on error. + */ +static void check_msr(uint32_t msr, uint64_t bits_to_flip) +{ + uint64_t v = rdmsr(msr) ^ bits_to_flip; + + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(0); + + v ^= bits_to_flip; + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(0); +} + +static void intel_guest_code(void) +{ + check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + check_msr(MSR_P6_EVNTSEL0, 0xffff); + check_msr(MSR_IA32_PMC0, 0xffff); + GUEST_SYNC(1); + + for (;;) { + uint64_t br0, br1; + + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + br0 = rdmsr(MSR_IA32_PMC0); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + br1 = rdmsr(MSR_IA32_PMC0); + GUEST_SYNC(br1 - br0); + } +} + +/* + * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], + * this code uses the always-available, legacy K7 PMU MSRs, which alias to + * the first four of the six extended core PMU MSRs. + */ +static void amd_guest_code(void) +{ + check_msr(MSR_K7_EVNTSEL0, 0xffff); + check_msr(MSR_K7_PERFCTR0, 0xffff); + GUEST_SYNC(1); + + for (;;) { + uint64_t br0, br1; + + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED); + br0 = rdmsr(MSR_K7_PERFCTR0); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + br1 = rdmsr(MSR_K7_PERFCTR0); + GUEST_SYNC(br1 - br0); + } +} + +/* + * Run the VM to the next GUEST_SYNC(value), and return the value passed + * to the sync. Any other exit from the guest is fatal. + */ +static uint64_t run_vm_to_sync(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); + return uc.args[1]; +} + +/* + * In a nested environment or if the vPMU is disabled, the guest PMU + * might not work as architected (accessing the PMU MSRs may raise + * #GP, or writes could simply be discarded). In those situations, + * there is no point in running these tests. The guest code will perform + * a sanity check and then GUEST_SYNC(success). In the case of failure, + * the behavior of the guest on resumption is undefined. + */ +static bool sanity_check_pmu(struct kvm_vm *vm) +{ + bool success; + + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + success = run_vm_to_sync(vm); + vm_install_exception_handler(vm, GP_VECTOR, NULL); + + return success; +} + +static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents) +{ + struct kvm_pmu_event_filter *f; + int size = sizeof(*f) + nevents * sizeof(f->events[0]); + + f = malloc(size); + TEST_ASSERT(f, "Out of memory"); + memset(f, 0, size); + f->nevents = nevents; + return f; +} + +static struct kvm_pmu_event_filter *event_filter(uint32_t action) +{ + struct kvm_pmu_event_filter *f; + int i; + + f = make_pmu_event_filter(ARRAY_SIZE(event_list)); + f->action = action; + for (i = 0; i < ARRAY_SIZE(event_list); i++) + f->events[i] = event_list[i]; + + return f; +} + +/* + * Remove the first occurrence of 'event' (if any) from the filter's + * event list. + */ +static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f, + uint64_t event) +{ + bool found = false; + int i; + + for (i = 0; i < f->nevents; i++) { + if (found) + f->events[i - 1] = f->events[i]; + else + found = f->events[i] == event; + } + if (found) + f->nevents--; + return f; +} + +static void test_without_filter(struct kvm_vm *vm) +{ + uint64_t count = run_vm_to_sync(vm); + + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static uint64_t test_with_filter(struct kvm_vm *vm, + struct kvm_pmu_event_filter *f) +{ + vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); + return run_vm_to_sync(vm); +} + +static void test_member_deny_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); + uint64_t count = test_with_filter(vm, f); + + free(f); + if (count) + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", + __func__, count); + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); +} + +static void test_member_allow_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); + uint64_t count = test_with_filter(vm, f); + + free(f); + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static void test_not_member_deny_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); + uint64_t count; + + remove_event(f, INTEL_BR_RETIRED); + remove_event(f, AMD_ZEN_BR_RETIRED); + count = test_with_filter(vm, f); + free(f); + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static void test_not_member_allow_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); + uint64_t count; + + remove_event(f, INTEL_BR_RETIRED); + remove_event(f, AMD_ZEN_BR_RETIRED); + count = test_with_filter(vm, f); + free(f); + if (count) + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", + __func__, count); + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); +} + +/* + * Check for a non-zero PMU version, at least one general-purpose + * counter per logical processor, an EBX bit vector of length greater + * than 5, and EBX[5] clear. + */ +static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) +{ + union cpuid10_eax eax = { .full = entry->eax }; + union cpuid10_ebx ebx = { .full = entry->ebx }; + + return eax.split.version_id && eax.split.num_counters > 0 && + eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED && + !ebx.split.no_branch_instruction_retired; +} + +/* + * Note that CPUID leaf 0xa is Intel-specific. This leaf should be + * clear on AMD hardware. + */ +static bool use_intel_pmu(void) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + + cpuid = kvm_get_supported_cpuid(); + entry = kvm_get_supported_cpuid_index(0xa, 0); + return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); +} + +static bool is_zen1(uint32_t eax) +{ + return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f; +} + +static bool is_zen2(uint32_t eax) +{ + return x86_family(eax) == 0x17 && + x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f; +} + +static bool is_zen3(uint32_t eax) +{ + return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f; +} + +/* + * Determining AMD support for a PMU event requires consulting the AMD + * PPR for the CPU or reference material derived therefrom. The AMD + * test code herein has been verified to work on Zen1, Zen2, and Zen3. + * + * Feel free to add more AMD CPUs that are documented to support event + * select 0xc2 umask 0 as "retired branch instructions." + */ +static bool use_amd_pmu(void) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + + cpuid = kvm_get_supported_cpuid(); + entry = kvm_get_supported_cpuid_index(1, 0); + return is_amd_cpu() && entry && + (is_zen1(entry->eax) || + is_zen2(entry->eax) || + is_zen3(entry->eax)); +} + +int main(int argc, char *argv[]) +{ + void (*guest_code)(void) = NULL; + struct kvm_vm *vm; + int r; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER); + if (!r) { + print_skip("KVM_CAP_PMU_EVENT_FILTER not supported"); + exit(KSFT_SKIP); + } + + if (use_intel_pmu()) + guest_code = intel_guest_code; + else if (use_amd_pmu()) + guest_code = amd_guest_code; + + if (!guest_code) { + print_skip("Don't know how to test this guest PMU"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + if (!sanity_check_pmu(vm)) { + print_skip("Guest PMU is not functional"); + exit(KSFT_SKIP); + } + + test_without_filter(vm); + test_member_deny_list(vm); + test_member_allow_list(vm); + test_not_member_deny_list(vm); + test_not_member_allow_list(vm); + + kvm_vm_free(vm); + + return 0; +} From fc4fad79fc3d8841562e2a85808079da5b4835f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Dec 2021 23:24:36 +0000 Subject: [PATCH 0208/1295] KVM: VMX: Reject KVM_RUN if emulation is required with pending exception Reject KVM_RUN if emulation is required (because VMX is running without unrestricted guest) and an exception is pending, as KVM doesn't support emulating exceptions except when emulating real mode via vm86. The vCPU is hosed either way, but letting KVM_RUN proceed triggers a WARN due to the impossible condition. Alternatively, the WARN could be removed, but then userspace and/or KVM bugs would result in the vCPU silently running in a bad state, which isn't very friendly to users. Originally, the bug was hit by syzkaller with a nested guest as that doesn't require kvm_intel.unrestricted_guest=0. That particular flavor is likely fixed by commit cd0e615c49e5 ("KVM: nVMX: Synthesize TRIPLE_FAULT for L2 if emulation is required"), but it's trivial to trigger the WARN with a non-nested guest, and userspace can likely force bad state via ioctls() for a nested guest as well. Checking for the impossible condition needs to be deferred until KVM_RUN because KVM can't force specific ordering between ioctls. E.g. clearing exception.pending in KVM_SET_SREGS doesn't prevent userspace from setting it in KVM_SET_VCPU_EVENTS, and disallowing KVM_SET_VCPU_EVENTS with emulation_required would prevent userspace from queuing an exception and then stuffing sregs. Note, if KVM were to try and detect/prevent the condition prior to KVM_RUN, handle_invalid_guest_state() and/or handle_emulation_failure() would need to be modified to clear the pending exception prior to exiting to userspace. ------------[ cut here ]------------ WARNING: CPU: 6 PID: 137812 at arch/x86/kvm/vmx/vmx.c:1623 vmx_queue_exception+0x14f/0x160 [kvm_intel] CPU: 6 PID: 137812 Comm: vmx_invalid_nes Not tainted 5.15.2-7cc36c3e14ae-pop #279 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:vmx_queue_exception+0x14f/0x160 [kvm_intel] Code: <0f> 0b e9 fd fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 RSP: 0018:ffffa45c83577d38 EFLAGS: 00010202 RAX: 0000000000000003 RBX: 0000000080000006 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000010002 RDI: ffff9916af734000 RBP: ffff9916af734000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000006 R13: 0000000000000000 R14: ffff9916af734038 R15: 0000000000000000 FS: 00007f1e1a47c740(0000) GS:ffff99188fb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1e1a6a8008 CR3: 000000026f83b005 CR4: 00000000001726e0 Call Trace: kvm_arch_vcpu_ioctl_run+0x13a2/0x1f20 [kvm] kvm_vcpu_ioctl+0x279/0x690 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+82112403ace4cbd780d8@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20211228232437.1875318-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++-- arch/x86/kvm/x86.c | 12 +++++++++--- 5 files changed, 37 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index f658bb4dbb74..65c3f1857d6e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush) KVM_X86_OP_NULL(tlb_remote_flush_with_range) KVM_X86_OP(tlb_flush_gva) KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(vcpu_pre_run) KVM_X86_OP(run) KVM_X86_OP_NULL(handle_exit) KVM_X86_OP_NULL(skip_emulated_instruction) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89d1fdb39c46..6a8fa26ef98c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1380,6 +1380,7 @@ struct kvm_x86_ops { */ void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d8cac84fb2dc..bf0c3a67d836 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3829,6 +3829,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); } +static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + return 1; +} + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -4658,6 +4663,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, + .vcpu_pre_run = svm_vcpu_pre_run, .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 15e30602782b..41aaa37d9eb8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7708,6 +7725,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52df8e6eaa57..49ff85e966d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10393,10 +10393,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - if (kvm_run->immediate_exit) + if (kvm_run->immediate_exit) { r = -EINTR; - else - r = vcpu_run(vcpu); + goto out; + } + + r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + if (r <= 0) + goto out; + + r = vcpu_run(vcpu); out: kvm_put_guest_fpu(vcpu); From e337f7e063641ca4d040c8210d4bd790b81effb0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Dec 2021 23:24:37 +0000 Subject: [PATCH 0209/1295] KVM: selftests: Add a test to force emulation with a pending exception Add a VMX specific test to verify that KVM doesn't explode if userspace attempts KVM_RUN when emulation is required with a pending exception. KVM VMX's emulation support for !unrestricted_guest punts exceptions to userspace instead of attempting to synthesize the exception with all the correct state (and stack switching, etc...). Punting is acceptable as there's never been a request to support injecting exceptions when emulating due to invalid state, but KVM has historically assumed that userspace will do the right thing and either clear the exception or kill the guest. Deliberately do the opposite and attempt to re-enter the guest with a pending exception and emulation required to verify KVM continues to punt the combination to userspace, e.g. doesn't explode, WARN, etc... Signed-off-by: Sean Christopherson Message-Id: <20211228232437.1875318-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../vmx_exception_with_invalid_guest_state.c | 139 ++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 9fe19f412f44..c2165b137a50 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -37,6 +37,7 @@ /x86_64/vmx_apic_access_test /x86_64/vmx_close_while_nested_test /x86_64/vmx_dirty_log_test +/x86_64/vmx_exception_with_invalid_guest_state /x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7fbc80e3ecdf..81ebf99d6ff0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -70,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c new file mode 100644 index 000000000000..27a850f3d7ce --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include +#include +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 0 + +static struct kvm_vm *vm; + +static void guest_ud_handler(struct ex_regs *regs) +{ + /* Loop on the ud2 until guest state is made invalid. */ +} + +static void guest_code(void) +{ + asm volatile("ud2"); +} + +static void __run_vcpu_with_invalid_state(void) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + + vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Expected emulation failure, got %d\n", + run->emulation_failure.suberror); +} + +static void run_vcpu_with_invalid_state(void) +{ + /* + * Always run twice to verify KVM handles the case where _KVM_ queues + * an exception with invalid state and then exits to userspace, i.e. + * that KVM doesn't explode if userspace ignores the initial error. + */ + __run_vcpu_with_invalid_state(); + __run_vcpu_with_invalid_state(); +} + +static void set_timer(void) +{ + struct itimerval timer; + + timer.it_value.tv_sec = 0; + timer.it_value.tv_usec = 200; + timer.it_interval = timer.it_value; + ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); +} + +static void set_or_clear_invalid_guest_state(bool set) +{ + static struct kvm_sregs sregs; + + if (!sregs.cr0) + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.tr.unusable = !!set; + vcpu_sregs_set(vm, VCPU_ID, &sregs); +} + +static void set_invalid_guest_state(void) +{ + set_or_clear_invalid_guest_state(true); +} + +static void clear_invalid_guest_state(void) +{ + set_or_clear_invalid_guest_state(false); +} + +static void sigalrm_handler(int sig) +{ + struct kvm_vcpu_events events; + + TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); + + vcpu_events_get(vm, VCPU_ID, &events); + + /* + * If an exception is pending, attempt KVM_RUN with invalid guest, + * otherwise rearm the timer and keep doing so until the timer fires + * between KVM queueing an exception and re-entering the guest. + */ + if (events.exception.pending) { + set_invalid_guest_state(); + run_vcpu_with_invalid_state(); + } else { + set_timer(); + } +} + +int main(int argc, char *argv[]) +{ + if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) { + print_skip("Must be run with kvm_intel.unrestricted_guest=0"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *)guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* + * Stuff invalid guest state for L2 by making TR unusuable. The next + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support + * emulating invalid guest state for L2. + */ + set_invalid_guest_state(); + run_vcpu_with_invalid_state(); + + /* + * Verify KVM also handles the case where userspace gains control while + * an exception is pending and stuffs invalid state. Run with valid + * guest state and a timer firing every 200us, and attempt to enter the + * guest with invalid state when the handler interrupts KVM with an + * exception pending. + */ + clear_invalid_guest_state(); + TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, + "Failed to register SIGALRM handler, errno = %d (%s)", + errno, strerror(errno)); + + set_timer(); + run_vcpu_with_invalid_state(); +} From e09fccb5435d7b9ab3fd5dfeada8ae40cfa56e08 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 13 Jan 2022 13:29:24 +0100 Subject: [PATCH 0210/1295] KVM: avoid warning on s390 in mark_page_dirty Avoid warnings on s390 like [ 1801.980931] CPU: 12 PID: 117600 Comm: kworker/12:0 Tainted: G E 5.17.0-20220113.rc0.git0.32ce2abb03cf.300.fc35.s390x+next #1 [ 1801.980938] Workqueue: events irqfd_inject [kvm] [...] [ 1801.981057] Call Trace: [ 1801.981060] [<000003ff805f0f5c>] mark_page_dirty_in_slot+0xa4/0xb0 [kvm] [ 1801.981083] [<000003ff8060e9fe>] adapter_indicators_set+0xde/0x268 [kvm] [ 1801.981104] [<000003ff80613c24>] set_adapter_int+0x64/0xd8 [kvm] [ 1801.981124] [<000003ff805fb9aa>] kvm_set_irq+0xc2/0x130 [kvm] [ 1801.981144] [<000003ff805f8d86>] irqfd_inject+0x76/0xa0 [kvm] [ 1801.981164] [<0000000175e56906>] process_one_work+0x1fe/0x470 [ 1801.981173] [<0000000175e570a4>] worker_thread+0x64/0x498 [ 1801.981176] [<0000000175e5ef2c>] kthread+0x10c/0x110 [ 1801.981180] [<0000000175de73c8>] __ret_from_fork+0x40/0x58 [ 1801.981185] [<000000017698440a>] ret_from_fork+0xa/0x40 when writing to a guest from an irqfd worker as long as we do not have the dirty ring. Signed-off-by: Christian Borntraeger Reluctantly-acked-by: David Woodhouse Message-Id: <20220113122924.740496-1-borntraeger@linux.ibm.com> Fixes: 2efd61a608b0 ("KVM: Warn if mark_page_dirty() is called without an active vCPU") Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e8e9d36f382..dc6d1823a9d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3163,8 +3163,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm, { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); +#ifdef CONFIG_HAVE_KVM_DIRTY_RING if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) return; +#endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; From d76fb40637fc0e84b27bf431cd72cf8fe3f813ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:14 +0000 Subject: [PATCH 0211/1295] KVM: VMX: Handle PI descriptor updates during vcpu_put/load Move the posted interrupt pre/post_block logic into vcpu_put/load respectively, using the kvm_vcpu_is_blocking() to determining whether or not the wakeup handler needs to be set (and unset). This avoids updating the PI descriptor if halt-polling is successful, reduces the number of touchpoints for updating the descriptor, and eliminates the confusing behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU is scheduled back in after preemption. The downside is that KVM will do the PID update twice if the vCPU is preempted after prepare_to_rcuwait() but before schedule(), but that's a rare case (and non-existent on !PREEMPT kernels). The notable wart is the need to send a self-IPI on the wakeup vector if an outstanding notification is pending after configuring the wakeup vector. Ideally, KVM would just do a kvm_vcpu_wake_up() in this case, but the scheduler doesn't support waking a task from its preemption notifier callback, i.e. while the task is right in the middle of being scheduled out. Note, setting the wakeup vector before halt-polling is not necessary: once the pending IRQ will be recorded in the PIR, kvm_vcpu_has_events() will detect this (via kvm_cpu_get_interrupt(), kvm_apic_get_interrupt(), apic_has_interrupt_for_ppr() and finally vmx_sync_pir_to_irr()) and terminate the polling. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 154 ++++++++++++++------------------- arch/x86/kvm/vmx/posted_intr.h | 8 +- arch/x86/kvm/vmx/vmx.c | 5 -- 3 files changed, 72 insertions(+), 95 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index a82a15aa1982..63e2399f353a 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -52,6 +52,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* @@ -62,23 +63,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!enable_apicv || !lapic_in_kernel(vcpu)) return; - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; + return; + } + + local_irq_save(flags); + + /* + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + list_del(&vcpu->blocked_vcpu_list); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); } - /* The full case. Set the new destination and clear SN. */ dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -86,10 +98,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) do { old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ new.ndst = dest; new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); + local_irq_restore(flags); + after_clear_sn: /* @@ -111,83 +135,24 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) irq_remapping_cap(IRQ_POSTING_CAP); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which - * will not be the same as the current pCPU if the task was migrated. - */ - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - - dest = cpu_physical_id(vcpu->cpu); - if (!x2apic_mode) - dest = (dest << 8) & 0xFF00; - - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the vCPU was blocking"); - - do { - old.control = new.control = READ_ONCE(pi_desc->control); - - new.ndst = dest; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); - - vcpu->pre_pcpu = -1; -} - /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; unsigned long flags; - if (!vmx_can_use_vtd_pi(vcpu->kvm) || - vmx_interrupt_blocked(vcpu)) - return 0; - local_irq_save(flags); - vcpu->pre_pcpu = vcpu->cpu; raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn == 1, - "Posted Interrupt Suppress Notification set before blocking"); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); do { old.control = new.control = READ_ONCE(pi_desc->control); @@ -196,24 +161,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc)) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); - return (vcpu->pre_pcpu == -1); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - unsigned long flags; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (vcpu->pre_pcpu == -1) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - local_irq_save(flags); - __pi_post_block(vcpu); - local_irq_restore(flags); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -254,7 +232,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 36ae035f14aa..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 41aaa37d9eb8..3a9a49a87b9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,9 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (pi_pre_block(vcpu)) - return 1; - if (kvm_lapic_hv_timer_in_use(vcpu)) kvm_lapic_switch_to_sw_timer(vcpu); @@ -7579,8 +7576,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); } static void vmx_setup_mce(struct kvm_vcpu *vcpu) From e6eec09b7bc7869a49ac0ff376415bad40030ade Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:15 +0000 Subject: [PATCH 0212/1295] KVM: Drop unused kvm_vcpu.pre_pcpu field Remove kvm_vcpu.pre_pcpu as it no longer has any users. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3c47b146851a..5c3c67b6318f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,7 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - int pre_pcpu; struct list_head blocked_vcpu_list; struct mutex mutex; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dc6d1823a9d9..c301cd8d583e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -427,7 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) #endif kvm_async_pf_vcpu_init(vcpu); - vcpu->pre_pcpu = -1; INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); From 12a8eee5686ef3ea7d8db90cd664f11e4a39e349 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:16 +0000 Subject: [PATCH 0213/1295] KVM: Move x86 VMX's posted interrupt list_head to vcpu_vmx Move the seemingly generic block_vcpu_list from kvm_vcpu to vcpu_vmx, and rename the list and all associated variables to clarify that it tracks the set of vCPU that need to be poked on a posted interrupt to the wakeup vector. The list is not used to track _all_ vCPUs that are blocking, and the term "blocked" can be misleading as it may refer to a blocking condition in the host or the guest, where as the PI wakeup case is specifically for the vCPUs that are actively blocking from within the guest. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 39 +++++++++++++++++----------------- arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/vmx/vmx.h | 3 +++ include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 2 -- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 63e2399f353a..901eea44cf24 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -19,7 +19,7 @@ * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the @@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ -static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -51,6 +51,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -86,9 +87,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } dest = cpu_physical_id(cpu); @@ -142,15 +143,16 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; local_irq_save(flags); - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); @@ -199,24 +201,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc)) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_kick(&vmx->vcpu); } - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3a9a49a87b9d..1840898bb184 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6943,6 +6943,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8fc7441baea..7f2c82e7f38f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -317,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3c67b6318f..f079820f52b5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,8 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - struct list_head blocked_vcpu_list; - struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c301cd8d583e..5a1164483e6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -427,8 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) #endif kvm_async_pf_vcpu_init(vcpu); - INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; From 98c25ead5eda5e9d41abe57839ad3e8caf19500c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:17 +0000 Subject: [PATCH 0214/1295] KVM: VMX: Move preemption timer <=> hrtimer dance to common x86 Handle the switch to/from the hypervisor/software timer when a vCPU is blocking in common x86 instead of in VMX. Even though VMX is the only user of a hypervisor timer, the logic and all functions involved are generic x86 (unless future CPUs do something completely different and implement a hypervisor timer that runs regardless of mode). Handling the switch in common x86 will allow for the elimination of the pre/post_blocks hooks, and also lets KVM switch back to the hypervisor timer if and only if it was in use (without additional params). Add a comment explaining why the switch cannot be deferred to kvm_sched_out() or kvm_vcpu_block(). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +----- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1840898bb184..70be166833ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7568,16 +7568,12 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - return 0; } static void vmx_post_block(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); + } static void vmx_setup_mce(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ff85e966d7..943706a09e6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10146,8 +10146,21 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { + bool hv_timer; + if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) kvm_vcpu_halt(vcpu); @@ -10155,6 +10168,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + if (kvm_x86_ops.post_block) static_call(kvm_x86_post_block)(vcpu); @@ -10349,6 +10365,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* + * It should be impossible for the hypervisor timer to be in + * use before KVM has ever run the vCPU. + */ + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); kvm_vcpu_block(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; From b6d42baddf85310eb2c455cf78af2580773c4ff0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:18 +0000 Subject: [PATCH 0215/1295] KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers Unexport switch_to_{hv,sw}_timer() now that common x86 handles the transitions. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c5028e6b0f96..baca9fa37a91 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { restart_apic_timer(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { @@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) start_sw_timer(apic); preempt_enable(); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) { From c3e8abf0f3536a46a235b0533149c2b2c2bbac27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:19 +0000 Subject: [PATCH 0216/1295] KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks Drop kvm_x86_ops' pre/post_block() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 -- arch/x86/include/asm/kvm_host.h | 12 ------------ arch/x86/kvm/vmx/vmx.c | 13 ------------- arch/x86/kvm/x86.c | 6 +----- 4 files changed, 1 insertion(+), 32 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 65c3f1857d6e..631d5040b31e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6a8fa26ef98c..682ad02a4e58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1454,18 +1454,6 @@ struct kvm_x86_ops { const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 70be166833ac..9ab98e1bd65a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,16 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7777,9 +7767,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 943706a09e6e..4b9728a53de6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10148,8 +10148,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { /* * Switch to the software timer before halt-polling/blocking as * the guest's timer may be a break event for the vCPU, and the @@ -10171,9 +10170,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } From 31f251d4ddfa464c6dd92ee873b9b223e992a085 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:20 +0000 Subject: [PATCH 0217/1295] KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode Signal the AVIC doorbell iff the vCPU is running in the guest. If the vCPU is not IN_GUEST_MODE, it's guaranteed to pick up any pending IRQs on the next VMRUN, which unconditionally processes the vIRR. Add comments to document the logic. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0e5b49294086..40039477bebb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -672,9 +672,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) return -1; kvm_lapic_set_irr(vec, vcpu->arch.apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode, which guarantees that either VMRUN will see + * and process the new vIRR entry, or that the below code will signal + * the doorbell if the vCPU is already running in the guest. + */ smp_mb__after_atomic(); - if (avic_vcpu_is_running(vcpu)) { + /* + * Signal the doorbell to tell hardware to inject the IRQ if the vCPU + * is in the guest. If the vCPU is not in the guest, hardware will + * automatically process AVIC interrupts at VMRUN. + */ + if (vcpu->mode == IN_GUEST_MODE) { int cpu = READ_ONCE(vcpu->cpu); /* @@ -688,8 +701,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (cpu != get_cpu()) wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); - } else + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ kvm_vcpu_wake_up(vcpu); + } return 0; } From 202470d536b2cad22fa859f3e01202571c49ded9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:21 +0000 Subject: [PATCH 0218/1295] KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs Drop the avic_vcpu_is_running() check when waking vCPUs in response to a VM-Exit due to incomplete IPI delivery. The check isn't wrong per se, but it's not 100% accurate in the sense that it doesn't guarantee that the vCPU was one of the vCPUs that didn't receive the IPI. The check isn't required for correctness as blocking == !running in this context. From a performance perspective, waking a live task is not expensive as the only moderately costly operation is a locked operation to temporarily disable preemption. And if that is indeed a performance issue, kvm_vcpu_is_blocking() would be a better check than poking into the AVIC. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 15 +++++++++------ arch/x86/kvm/svm/svm.h | 11 ----------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 40039477bebb..2d8278167c0f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, struct kvm_vcpu *vcpu; unsigned long i; + /* + * Wake any target vCPUs that are blocking, i.e. waiting for a wake + * event. There's no need to signal doorbells, as hardware has handled + * vCPUs that were in guest at the time of the IPI, and vCPUs that have + * since entered the guest will have processed pending IRQs at VMRUN. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, source, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK)) kvm_vcpu_wake_up(vcpu); } } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6d29421dd4bf..fd0685b2ce55 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -573,17 +573,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); From e422b88969489e67fd1a87a6ef4ef5c30bb53edb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:22 +0000 Subject: [PATCH 0219/1295] KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path Remove handling of KVM_REQ_APICV_UPDATE from svm_vcpu_unblocking(), it's no longer needed as it was made obsolete by commit df7e4827c549 ("KVM: SVM: call avic_vcpu_load/avic_vcpu_put when enabling/disabling AVIC"). Prior to that commit, the manual check was necessary to ensure the AVIC stuff was updated by avic_set_running() when a request to enable APICv became pending while the vCPU was blocking, as the request handling itself would not do the update. But, as evidenced by the commit, that logic was flawed and subject to various races. Now that svm_refresh_apicv_exec_ctrl() does avic_vcpu_load/put() in response to an APICv status change, drop the manual check in the unblocking path. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2d8278167c0f..7efe034c44c1 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1040,7 +1040,5 @@ void svm_vcpu_blocking(struct kvm_vcpu *vcpu) void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); avic_set_running(vcpu, true); } From af52f5aa5c1b46809834b728a13a1af5aab451e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:23 +0000 Subject: [PATCH 0220/1295] KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption Use kvm_vcpu_is_blocking() to determine whether or not the vCPU should be marked running during avic_vcpu_load(). Drop avic_is_running, which really should have been named "vcpu_is_not_blocking", as it tracked if the vCPU was blocking, not if it was actually running, e.g. it was set during svm_create_vcpu() when the vCPU was obviously not running. This is technically a teeny tiny functional change, as the vCPU will be marked IsRunning=1 on being reloaded if the vCPU is preempted between svm_vcpu_blocking() and prepare_to_rcuwait(). But that's a benign change as the vCPU will be marked IsRunning=0 when KVM voluntarily schedules out the vCPU. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 14 +++++++++----- arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 7efe034c44c1..368813320121 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -975,6 +975,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + bool is_blocking = kvm_vcpu_is_blocking(vcpu); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -992,12 +993,17 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) + + /* + * Don't mark the vCPU as running if its blocking, i.e. if the vCPU is + * preempted after svm_vcpu_blocking() but before KVM voluntarily + * schedules out the vCPU. + */ + if (!is_blocking) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, !is_blocking); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1018,11 +1024,9 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { - struct vcpu_svm *svm = to_svm(vcpu); int cpu = get_cpu(); WARN_ON(cpu != vcpu->cpu); - svm->avic_is_running = is_run; if (kvm_vcpu_apicv_active(vcpu)) { if (is_run) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bf0c3a67d836..d21ef0381d29 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1440,12 +1440,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (err) goto error_free_vmsa_page; - /* We initialize this flag to true to make sure that the is_running - * bit would be set the first time the vcpu is loaded. - */ - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) - svm->avic_is_running = true; - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fd0685b2ce55..5178f6b245e1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -225,7 +225,6 @@ struct vcpu_svm { u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; - bool avic_is_running; /* * Per-vcpu list of struct amd_svm_iommu_ir: From 782f64558de7bef84b90ea812deb38f0e53a8c7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:24 +0000 Subject: [PATCH 0221/1295] KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU Don't bother updating the Physical APIC table or IRTE when loading a vCPU that is blocking, i.e. won't be marked IsRun{ning}=1, as the pCPU is queried if and only if IsRunning is '1'. If the vCPU was migrated, the new pCPU will be picked up when avic_vcpu_load() is called by svm_vcpu_unblocking(). Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 368813320121..b7353e11da2e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -975,7 +975,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - bool is_blocking = kvm_vcpu_is_blocking(vcpu); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -986,24 +985,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - - /* - * Don't mark the vCPU as running if its blocking, i.e. if the vCPU is - * preempted after svm_vcpu_blocking() but before KVM voluntarily - * schedules out the vCPU. - */ - if (!is_blocking) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, !is_blocking); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1012,8 +1012,12 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); From 0f65a9d337676b966316db17374fbef910ab8e4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:26 +0000 Subject: [PATCH 0222/1295] KVM: VMX: Don't do full kick when triggering posted interrupt "fails" Replace the full "kick" with just the "wake" in the fallback path when triggering a virtual interrupt via a posted interrupt fails because the guest is not IN_GUEST_MODE. If the guest transitions into guest mode between the check and the kick, then it's guaranteed to see the pending interrupt as KVM syncs the PIR to IRR (and onto GUEST_RVI) after setting IN_GUEST_MODE. Kicking the guest in this case is nothing more than an unnecessary VM-Exit (and host IRQ). Opportunistically update comments to explain the various ordering rules and barriers at play. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9ab98e1bd65a..ac3e943ec2a4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3998,7 +3998,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } return -1; @@ -4036,7 +4036,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b9728a53de6..55518b7d3b96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9978,10 +9978,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. Assigned devices can - * use the POSTED_INTR_VECTOR even if APICv is disabled, - * so do it even if APICv is disabled on this vCPU. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); From 296aa26644d088d8ccf0d62b0a93443f7188d5e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:28 +0000 Subject: [PATCH 0223/1295] KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ Refactor the posted interrupt helper to take the desired notification vector instead of a bool so that the callers are self-documenting. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ac3e943ec2a4..063c21559aa1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3932,11 +3932,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3997,7 +3995,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; } @@ -4035,7 +4033,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; From ccf8d687542f6a7288b79727bec1cc084b3771b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:29 +0000 Subject: [PATCH 0224/1295] KVM: VMX: Fold fallback path into triggering posted IRQ helper Move the fallback "wake_up" path into the helper to trigger posted interrupt helper now that the nested and non-nested paths are identical. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 063c21559aa1..a02a28ce7cc3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3931,7 +3931,7 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec) { #ifdef CONFIG_SMP @@ -3962,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3995,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) - kvm_vcpu_wake_up(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -4033,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) - kvm_vcpu_wake_up(vcpu); - + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } From 635e6357f948d57bc98af8d37eb81896333822e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:30 +0000 Subject: [PATCH 0225/1295] KVM: VMX: Don't do full kick when handling posted interrupt wakeup When waking vCPUs in the posted interrupt wakeup handling, do exactly that and no more. There is no need to kick the vCPU as the wakeup handler just needs to get the vCPU task running, and if it's in the guest then it's definitely running. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 901eea44cf24..aa1fe9085d77 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -209,7 +209,7 @@ void pi_wakeup_handler(void) pi_wakeup_list) { if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_kick(&vmx->vcpu); + kvm_vcpu_wake_up(&vmx->vcpu); } raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } From 935a7333958e91b5d0c1b0ebc75a5cefdbb34dd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:31 +0000 Subject: [PATCH 0226/1295] KVM: SVM: Drop AVIC's intermediate avic_set_running() helper Drop avic_set_running() in favor of calling avic_vcpu_{load,put}() directly, and modify the block+put path to use preempt_disable/enable() instead of get/put_cpu(), as it doesn't actually care about the current pCPU associated with the vCPU. Opportunistically add lockdep assertions as being preempted in avic_vcpu_put() would lead to consuming stale data, even though doing so _in the current code base_ would not be fatal. Add a much needed comment explaining why svm_vcpu_blocking() needs to unload the AVIC and update the IRTE _before_ the vCPU starts blocking. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 56 ++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b7353e11da2e..522f859408cb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -978,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -1011,6 +1013,8 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ @@ -1023,30 +1027,42 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/* - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) -{ - int cpu = get_cpu(); - - WARN_ON(cpu != vcpu->cpu); - - if (kvm_vcpu_apicv_active(vcpu)) { - if (is_run) - avic_vcpu_load(vcpu, cpu); - else - avic_vcpu_put(vcpu); - } - put_cpu(); -} - void svm_vcpu_blocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, false); + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + preempt_disable(); + + /* + * Unload the AVIC when the vCPU is about to block, _before_ + * the vCPU actually blocks. + * + * Any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source, therefore vIRR will also + * be checked by kvm_vcpu_check_block() before blocking. The + * memory barrier implicit in set_current_state orders writing + * IsRunning=0 before reading the vIRR. The processor needs a + * matching memory barrier on interrupt delivery between writing + * IRR and reading IsRunning; the lack of this barrier might be + * the cause of errata #1235). + */ + avic_vcpu_put(vcpu); + + preempt_enable(); } void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, true); + int cpu; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); + + avic_vcpu_load(vcpu, cpu); + + put_cpu(); } From 54744e17f031cbc5c5b995b1e275df1520c8a739 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:32 +0000 Subject: [PATCH 0227/1295] KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops Move svm_hardware_setup() below svm_x86_ops so that KVM can modify ops during setup, e.g. the vcpu_(un)blocking hooks can be nullified if AVIC is disabled or unsupported. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 467 +++++++++++++++++++++-------------------- 1 file changed, 234 insertions(+), 233 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d21ef0381d29..9b15d59e6344 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -869,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * The default MMIO mask is a single bit (excluding the present bit), - * which could conflict with the memory encryption bit. Check for - * memory encryption support and override the default MMIO mask if - * memory encryption is enabled. - */ -static __init void svm_adjust_mmio_mask(void) -{ - unsigned int enc_bit, mask_bit; - u64 msr, mask; - - /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x80000000) < 0x8000001f) - return; - - /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; - mask_bit = boot_cpu_data.x86_phys_bits; - - /* Increment the mask bit if it is the same as the encryption bit */ - if (enc_bit == mask_bit) - mask_bit++; - - /* - * If the mask bit location is below 52, then some bits above the - * physical addressing limit will always be reserved, so use the - * rsvd_bits() function to generate the mask. This mask, along with - * the present bit, will be used to generate a page fault with - * PFER.RSV = 1. - * - * If the mask bit location is 52 (or above), then clear the mask. - */ - mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); -} - static void svm_hardware_teardown(void) { int cpu; @@ -924,198 +883,6 @@ static void svm_hardware_teardown(void) iopm_base = 0; } -static __init void svm_set_cpu_caps(void) -{ - kvm_set_cpu_caps(); - - supported_xss = 0; - - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ - if (nested) { - kvm_cpu_cap_set(X86_FEATURE_SVM); - - if (nrips) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); - - if (npt_enabled) - kvm_cpu_cap_set(X86_FEATURE_NPT); - - if (tsc_scaling) - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); - - /* Nested VM can receive #VMEXIT instead of triggering #GP */ - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); - } - - /* CPUID 0x80000008 */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); - - /* CPUID 0x8000001F (SME/SEV features) */ - sev_set_cpu_caps(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); - - /* - * NX is required for shadow paging and for NPT if the NX huge pages - * mitigation is enabled. - */ - if (!boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("NX (Execute Disable) not supported\n"); - return -EOPNOTSUPP; - } - kvm_enable_efer_bits(EFER_NX); - - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (tsc_scaling) { - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - tsc_scaling = false; - } else { - pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - } - - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); - - /* Check for pause filtering support */ - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - pause_filter_count = 0; - pause_filter_thresh = 0; - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { - pause_filter_thresh = 0; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - /* - * KVM's MMU doesn't support using 2-level paging for itself, and thus - * NPT isn't supported if the host is using 2-level paging since host - * CR4 is unchanged on VMRUN. - */ - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) - npt_enabled = false; - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - /* Force VM NPT level equal to the host's paging level */ - kvm_configure_mmu(npt_enabled, get_npt_level(), - get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); - - /* Note, SEV setup consumes npt_enabled. */ - sev_hardware_setup(); - - svm_hv_hardware_setup(); - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - - if (enable_apicv) { - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } - - if (vls) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || - !IS_ENABLED(CONFIG_X86_64)) { - vls = false; - } else { - pr_info("Virtual VMLOAD VMSAVE supported\n"); - } - } - - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) - svm_gp_erratum_intercept = false; - - if (vgif) { - if (!boot_cpu_has(X86_FEATURE_VGIF)) - vgif = false; - else - pr_info("Virtual GIF supported\n"); - } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - - if (!enable_pmu) - pr_info("PMU virtualization is disabled\n"); - - svm_set_cpu_caps(); - - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - - return 0; - -err: - svm_hardware_teardown(); - return r; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -4738,6 +4505,240 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* AMD PMU PERFCTR_CORE CPUID */ + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + unsigned int order = get_order(IOPM_SIZE); + + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + + iopm_pages = alloc_pages(GFP_KERNEL, order); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } + } + + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) + npt_enabled = false; + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + + /* Note, SEV setup consumes npt_enabled. */ + sev_hardware_setup(); + + svm_hv_hardware_setup(); + + svm_adjust_mmio_mask(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } + + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + + svm_set_cpu_caps(); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + + return 0; + +err: + svm_hardware_teardown(); + return r; +} + + static struct kvm_x86_init_ops svm_init_ops __initdata = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, From a3c19d5beaad25fcaa703b251c72c3a22fc09100 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:33 +0000 Subject: [PATCH 0228/1295] KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled Nullify svm_x86_ops.vcpu_(un)blocking if AVIC/APICv is disabled as the hooks are necessary only to clear the vCPU's IsRunning entry in the Physical APIC and to update IRTE entries if the VM has a pass-through device attached. Opportunistically rename the helpers to clarify their AVIC relationship. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/svm/svm.c | 7 +++++-- arch/x86/kvm/svm/svm.h | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 522f859408cb..90364d02f22a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1027,7 +1027,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -1052,7 +1052,7 @@ void svm_vcpu_blocking(struct kvm_vcpu *vcpu) preempt_enable(); } -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { int cpu; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9b15d59e6344..6d31d357a83b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4391,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_blocking = svm_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_blocking = avic_vcpu_blocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, @@ -4682,6 +4682,9 @@ static __init int svm_hardware_setup(void) pr_info("AVIC enabled\n"); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } else { + svm_x86_ops.vcpu_blocking = NULL; + svm_x86_ops.vcpu_unblocking = NULL; } if (vls) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5178f6b245e1..47ef8f4a9358 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -592,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu); -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_vcpu_blocking(struct kvm_vcpu *vcpu); +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ From d5ad5b1c04c85f01850e88231cad7dfbc9e1d30c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 18 Jan 2022 17:20:52 +0500 Subject: [PATCH 0229/1295] selftests: kvm: add amx_test to .gitignore amx_test's binary should be present in the .gitignore file for the git to ignore it. Fixes: bf70636d9443 ("selftest: kvm: Add amx selftest") Signed-off-by: Muhammad Usama Anjum Message-Id: <20220118122053.1941915-1-usama.anjum@collabora.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index c2165b137a50..dce7de7755e6 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,6 +8,7 @@ /s390x/memop /s390x/resets /s390x/sync_regs_test +/x86_64/amx_test /x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs From 6b34cd8e175bfbf4f3f01b6d19eae18245e1a8cc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 17 Jan 2022 16:28:29 +0000 Subject: [PATCH 0230/1295] btrfs: fix too long loop when defragging a 1 byte file When attempting to defrag a file with a single byte, we can end up in a too long loop, which is nearly infinite because at btrfs_defrag_file() we end up with the variable last_byte assigned with a value of 18446744073709551615 (which is (u64)-1). The problem comes from the fact we end up doing: last_byte = round_up(last_byte, fs_info->sectorsize) - 1; So if last_byte was assigned 0, which is i_size - 1, we underflow and end up with the value 18446744073709551615. This is trivial to reproduce and the following script triggers it: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT echo -n "X" > $MNT/foobar btrfs filesystem defragment $MNT/foobar umount $MNT So fix this by not decrementing last_byte by 1 before doing the sector size round up. Also, to make it easier to follow, make the round up right after computing last_byte. Reported-by: Anthony Ruhier Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5bd6926f7ff..6ad2bc2e5af3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1518,12 +1518,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (range->start + range->len > range->start) { /* Got a specific range */ - last_byte = min(isize, range->start + range->len) - 1; + last_byte = min(isize, range->start + range->len); } else { /* Defrag until file end */ - last_byte = isize - 1; + last_byte = isize; } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so @@ -1536,10 +1540,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - while (cur < last_byte) { u64 cluster_end; From b767c2fc787e992daeadfff40d61c05f66c82da0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jan 2022 13:43:31 +0000 Subject: [PATCH 0231/1295] btrfs: allow defrag to be interruptible During defrag, at btrfs_defrag_file(), we have this loop that iterates over a file range in steps no larger than 256K subranges. If the range is too long, there's no way to interrupt it. So make the loop check in each iteration if there's signal pending, and if there is, break and return -AGAIN to userspace. Before kernel 5.16, we used to allow defrag to be cancelled through a signal, but that was lost with commit 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()"). This change adds back the possibility to cancel a defrag with a signal and keeps the same semantics, returning -EAGAIN to user space (and not the usually more expected -EINTR). This is also motivated by a recent bug on 5.16 where defragging a 1 byte file resulted in iterating from file range 0 to (u64)-1, as hitting the bug triggered a too long loop, basically requiring one to reboot the machine, as it was not possible to cancel defrag. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6ad2bc2e5af3..6391be7409d8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1546,6 +1546,11 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, /* The cluster size 256K should always be page aligned */ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } + /* We want the cluster end at page boundary when possible */ cluster_end = (((cur >> PAGE_SHIFT) + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; From 70431bfd825d9cd5d93412c0456f253ecad6c415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 17 Nov 2020 15:56:59 +0000 Subject: [PATCH 0232/1295] cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,," where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells Acked-by: Jeff Layton cc: Steve French cc: Shyam Prasad N cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 +- fs/cifs/Makefile | 2 +- fs/cifs/cache.c | 105 -------------- fs/cifs/cifsfs.c | 19 +-- fs/cifs/cifsglob.h | 5 +- fs/cifs/connect.c | 15 +- fs/cifs/dir.c | 5 + fs/cifs/file.c | 66 +++++---- fs/cifs/fscache.c | 343 +++++++++++---------------------------------- fs/cifs/fscache.h | 130 ++++++----------- fs/cifs/inode.c | 19 ++- 11 files changed, 205 insertions(+), 506 deletions(-) delete mode 100644 fs/cifs/cache.c diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 346ae8716deb..3b7e3b9e4fd2 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -188,7 +188,7 @@ config CIFS_SMB_DIRECT config CIFS_FSCACHE bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE_OLD_API || CIFS=y && FSCACHE_OLD_API=y + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y help Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data to be cached locally on disk through the general filesystem cache diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 87fcacdf3de7..cc8fdcb35b71 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -25,7 +25,7 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o -cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c deleted file mode 100644 index 8be57aaedab6..000000000000 --- a/fs/cifs/cache.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1 -/* - * CIFS filesystem cache index structure definitions - * - * Copyright (c) 2010 Novell, Inc. - * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> - * - */ -#include "fscache.h" -#include "cifs_debug.h" - -/* - * CIFS filesystem definition for FS-Cache - */ -struct fscache_netfs cifs_fscache_netfs = { - .name = "cifs", - .version = 0, -}; - -/* - * Register CIFS for caching with FS-Cache - */ -int cifs_fscache_register(void) -{ - return fscache_register_netfs(&cifs_fscache_netfs); -} - -/* - * Unregister CIFS for caching - */ -void cifs_fscache_unregister(void) -{ - fscache_unregister_netfs(&cifs_fscache_netfs); -} - -/* - * Server object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_server_index_def = { - .name = "CIFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -static enum -fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_super_auxdata auxdata; - const struct cifs_tcon *tcon = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Superblock object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_super_index_def = { - .name = "CIFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .check_aux = cifs_fscache_super_check_aux, -}; - -static enum -fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -const struct fscache_cookie_def cifs_fscache_inode_object_def = { - .name = "CIFS.uniqueid", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = cifs_fscache_inode_check_aux, -}; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 36b2e0cb9736..199edac0cb59 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -397,6 +397,9 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + cifs_fscache_unuse_inode_cookie(inode, true); + cifs_fscache_release_inode_cookie(inode); clear_inode(inode); } @@ -721,6 +724,12 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) } #endif +static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); + return 0; +} + static int cifs_drop_inode(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -733,6 +742,7 @@ static int cifs_drop_inode(struct inode *inode) static const struct super_operations cifs_super_ops = { .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, + .write_inode = cifs_write_inode, .free_inode = cifs_free_inode, .drop_inode = cifs_drop_inode, .evict_inode = cifs_evict_inode, @@ -1625,13 +1635,9 @@ init_cifs(void) goto out_destroy_cifsoplockd_wq; } - rc = cifs_fscache_register(); - if (rc) - goto out_destroy_deferredclose_wq; - rc = cifs_init_inodecache(); if (rc) - goto out_unreg_fscache; + goto out_destroy_deferredclose_wq; rc = cifs_init_mids(); if (rc) @@ -1693,8 +1699,6 @@ out_destroy_mids: cifs_destroy_mids(); out_destroy_inodecache: cifs_destroy_inodecache(); -out_unreg_fscache: - cifs_fscache_unregister(); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1730,7 +1734,6 @@ exit_cifs(void) cifs_destroy_request_bufs(); cifs_destroy_mids(); cifs_destroy_inodecache(); - cifs_fscache_unregister(); destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cace0818d510..48b343d03430 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -668,9 +668,6 @@ struct TCP_Server_Info { unsigned int total_read; /* total amount of data read in this pass */ atomic_t in_send; /* requests trying to send */ atomic_t num_waiters; /* blocked waiting to get in sendrecv */ -#ifdef CONFIG_CIFS_FSCACHE - struct fscache_cookie *fscache; /* client index cache cookie */ -#endif #ifdef CONFIG_CIFS_STATS2 atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */ atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */ @@ -1112,7 +1109,7 @@ struct cifs_tcon { __u32 max_bytes_copy; #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ - struct fscache_cookie *fscache; /* cookie for share */ + struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fid crfid; /* Cached root fid */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 453aba3c2ad6..11a22a30ee14 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1444,10 +1444,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_crypto_secmech_release(server); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(server)) - cifs_fscache_release_client_cookie(server); - kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; @@ -1609,14 +1605,6 @@ smbd_connected: list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(tcp_ses)) - cifs_fscache_get_client_cookie(tcp_ses); -#ifdef CONFIG_CIFS_FSCACHE - else - tcp_ses->fscache = tcp_ses->primary_server->fscache; -#endif /* CONFIG_CIFS_FSCACHE */ - /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); @@ -3128,7 +3116,8 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * Inside cifs_fscache_get_super_cookie it checks * that we do not get super cookie twice. */ - cifs_fscache_get_super_cookie(tcon); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + cifs_fscache_get_super_cookie(tcon); out: mnt_ctx->server = server; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6e8e7cc26ae2..ce9b22aecfba 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -22,6 +22,7 @@ #include "cifs_unicode.h" #include "fs_context.h" #include "cifs_ioctl.h" +#include "fscache.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -507,8 +508,12 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); rc = -ENOMEM; + goto out; } + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + out: cifs_put_tlink(tlink); out_free_xid: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9fee3af83a73..59334be9ed3b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -376,8 +376,6 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) struct cifsLockInfo *li, *tmp; struct super_block *sb = inode->i_sb; - cifs_fscache_release_inode_cookie(inode); - /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. @@ -570,7 +568,7 @@ int cifs_open(struct inode *inode, struct file *file) spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); - goto out; + goto use_cache; } else { _cifsFileInfo_put(cfile, true, false); } @@ -632,8 +630,6 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } - cifs_fscache_set_inode_cookie(inode, file); - if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* * Time to set mode which we can not set earlier due to @@ -652,6 +648,15 @@ int cifs_open(struct inode *inode, struct file *file) cfile->pid); } +use_cache: + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + if (file->f_flags & O_DIRECT && + (!((file->f_flags & O_ACCMODE) != O_RDONLY) || + file->f_flags & O_APPEND)) + cifs_invalidate_cache(file_inode(file), + FSCACHE_INVAL_DIO_WRITE); + out: free_dentry_path(page); free_xid(xid); @@ -876,6 +881,8 @@ int cifs_close(struct inode *inode, struct file *file) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_deferred_close *dclose; + cifs_fscache_unuse_inode_cookie(inode, file->f_mode & FMODE_WRITE); + if (file->private_data != NULL) { cfile = file->private_data; file->private_data = NULL; @@ -886,7 +893,6 @@ int cifs_close(struct inode *inode, struct file *file) dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); - cifs_fscache_update_inode_cookie(inode); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -4198,10 +4204,12 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - cifs_fscache_wait_on_page_write(inode, page); +#ifdef CONFIG_CIFS_FSCACHE + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; +#endif lock_page(page); return VM_FAULT_LOCKED; @@ -4275,8 +4283,6 @@ cifs_readv_complete(struct work_struct *work) if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); - else - cifs_fscache_uncache_page(rdata->mapping->host, page); got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); @@ -4593,11 +4599,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, kref_put(&rdata->refcount, cifs_readdata_release); } - /* Any pages that have been shown to fscache but didn't get added to - * the pagecache must be uncached before they get returned to the - * allocator. - */ - cifs_fscache_readpages_cancel(mapping->host, page_list); free_xid(xid); return rc; } @@ -4801,17 +4802,19 @@ static int cifs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - - return cifs_fscache_release_page(page, gfp); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + } + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + return true; } static void cifs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { - struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - - if (offset == 0 && length == PAGE_SIZE) - cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); + wait_on_page_fscache(page); } static int cifs_launder_page(struct page *page) @@ -4831,7 +4834,7 @@ static int cifs_launder_page(struct page *page) if (clear_page_dirty_for_io(page)) rc = cifs_writepage_locked(page, &wbc); - cifs_fscache_invalidate_page(page, page->mapping->host); + wait_on_page_fscache(page); return rc; } @@ -4988,6 +4991,19 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. + */ +#ifdef CONFIG_CIFS_FSCACHE +static int cifs_set_page_dirty(struct page *page) +{ + return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host)); +} +#else +#define cifs_set_page_dirty __set_page_dirty_nobuffers +#endif + const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, @@ -4995,7 +5011,7 @@ const struct address_space_operations cifs_addr_ops = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .direct_IO = cifs_direct_io, .invalidatepage = cifs_invalidate_page, @@ -5020,7 +5036,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .invalidatepage = cifs_invalidate_page, .launder_page = cifs_launder_page, diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 003c5f1f4dfb..efaac4d5ff55 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -12,250 +12,136 @@ #include "cifs_fs_sb.h" #include "cifsproto.h" -/* - * Key layout of CIFS server cache index object - */ -struct cifs_server_key { - __u64 conn_id; -} __packed; - -/* - * Get a cookie for a server object keyed by {IPaddress,port,family} tuple - */ -void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +static void cifs_fscache_fill_volume_coherency( + struct cifs_tcon *tcon, + struct cifs_fscache_volume_coherency_data *cd) { - struct cifs_server_key key; + memset(cd, 0, sizeof(*cd)); + cd->resource_id = cpu_to_le64(tcon->resource_id); + cd->vol_create_time = tcon->vol_create_time; + cd->vol_serial_number = cpu_to_le32(tcon->vol_serial_number); +} - /* - * Check if cookie was already initialized so don't reinitialize it. - * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (server->fscache) - return; +int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +{ + struct cifs_fscache_volume_coherency_data cd; + struct TCP_Server_Info *server = tcon->ses->server; + struct fscache_volume *vcookie; + const struct sockaddr *sa = (struct sockaddr *)&server->dstaddr; + size_t slen, i; + char *sharename; + char *key; + int ret = -ENOMEM; + + tcon->fscache = NULL; + switch (sa->sa_family) { + case AF_INET: + case AF_INET6: + break; + default: + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); + return -EINVAL; + } memset(&key, 0, sizeof(key)); - key.conn_id = server->conn_id; - - server->fscache = - fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, - &key, sizeof(key), - NULL, 0, - server, 0, true); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); -} - -void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) -{ - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); - fscache_relinquish_cookie(server->fscache, NULL, false); - server->fscache = NULL; -} - -void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) -{ - struct TCP_Server_Info *server = tcon->ses->server; - char *sharename; - struct cifs_fscache_super_auxdata auxdata; - - /* - * Check if cookie was already initialized so don't reinitialize it. - * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (tcon->fscache) - return; sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - tcon->fscache = NULL; - return; + return -EINVAL; } - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; + slen = strlen(sharename); + for (i = 0; i < slen; i++) + if (sharename[i] == '/') + sharename[i] = ';'; - tcon->fscache = - fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, - sharename, strlen(sharename), - &auxdata, sizeof(auxdata), - tcon, 0, true); + key = kasprintf(GFP_KERNEL, "cifs,%pISpc,%s", sa, sharename); + if (!key) + goto out; + + cifs_fscache_fill_volume_coherency(tcon, &cd); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + &cd, sizeof(cd)); + cifs_dbg(FYI, "%s: (%s/0x%p)\n", __func__, key, vcookie); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + ret = PTR_ERR(vcookie); + goto out_2; + } + pr_err("Cache volume key already in use (%s)\n", key); + vcookie = NULL; + } + + tcon->fscache = vcookie; + ret = 0; +out_2: + kfree(key); +out: kfree(sharename); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server->fscache, tcon->fscache); + return ret; } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - struct cifs_fscache_super_auxdata auxdata; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; + struct cifs_fscache_volume_coherency_data cd; cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); - fscache_relinquish_cookie(tcon->fscache, &auxdata, false); + + cifs_fscache_fill_volume_coherency(tcon, &cd); + fscache_relinquish_volume(tcon->fscache, &cd, false); tcon->fscache = NULL; } -static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, - struct cifs_tcon *tcon) -{ - struct cifs_fscache_inode_auxdata auxdata; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - cifsi->fscache = - fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, - &cifsi->uniqueid, sizeof(cifsi->uniqueid), - &auxdata, sizeof(auxdata), - cifsi, cifsi->vfs_inode.i_size, true); -} - -static void cifs_fscache_enable_inode_cookie(struct inode *inode) +void cifs_fscache_get_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - if (cifsi->fscache) - return; + cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)) - return; + cifsi->fscache = + fscache_acquire_cookie(tcon->fscache, 0, + &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &cd, sizeof(cd), + i_size_read(&cifsi->vfs_inode)); +} - cifs_fscache_acquire_inode_cookie(cifsi, tcon); +void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) +{ + if (update) { + struct cifs_fscache_inode_coherency_data cd; + loff_t i_size = i_size_read(inode); - cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", - __func__, tcon->fscache, cifsi->fscache); + cifs_fscache_fill_coherency(inode, &cd); + fscache_unuse_cookie(cifs_inode_cookie(inode), &cd, &i_size); + } else { + fscache_unuse_cookie(cifs_inode_cookie(inode), NULL, NULL); + } } void cifs_fscache_release_inode_cookie(struct inode *inode) { - struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - /* fscache_relinquish_cookie does not seem to update auxdata */ - fscache_update_cookie(cifsi->fscache, &auxdata); - fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); + fscache_relinquish_cookie(cifsi->fscache, false); cifsi->fscache = NULL; } } -void cifs_fscache_update_inode_cookie(struct inode *inode) -{ - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_update_cookie(cifsi->fscache, &auxdata); - } -} - -void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - cifs_fscache_enable_inode_cookie(inode); -} - -void cifs_fscache_reset_inode_cookie(struct inode *inode) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct fscache_cookie *old = cifsi->fscache; - - if (cifsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(cifsi->fscache, NULL, true); - - cifs_fscache_acquire_inode_cookie(cifsi, tcon); - cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", - __func__, cifsi->fscache, old); - } -} - -int cifs_fscache_release_page(struct page *page, gfp_t gfp) -{ - if (PageFsCache(page)) { - struct inode *inode = page->mapping->host; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, page, cifsi->fscache); - if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) - return 0; - } - - return 1; -} - -static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, - int error) -{ - cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); - if (!error) - SetPageUptodate(page); - unlock_page(page); -} - /* * Retrieve a page from FS-Cache */ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - int ret; - cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", __func__, CIFS_I(inode)->fscache, page, inode); - ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, - cifs_readpage_from_fscache_complete, - NULL, - GFP_KERNEL); - switch (ret) { - - case 0: /* page found in fscache, read submitted */ - cifs_dbg(FYI, "%s: submitted\n", __func__); - return ret; - case -ENOBUFS: /* page won't be cached */ - case -ENODATA: /* page not in cache */ - cifs_dbg(FYI, "%s: %d\n", __func__, ret); - return 1; - - default: - cifs_dbg(VFS, "unknown error ret = %d\n", ret); - } - return ret; + return -ENOBUFS; // Needs conversion to using netfslib } /* @@ -266,78 +152,19 @@ int __cifs_readpages_from_fscache(struct inode *inode, struct list_head *pages, unsigned *nr_pages) { - int ret; - cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", __func__, CIFS_I(inode)->fscache, *nr_pages, inode); - ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, - pages, nr_pages, - cifs_readpage_from_fscache_complete, - NULL, - mapping_gfp_mask(mapping)); - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - cifs_dbg(FYI, "%s: submitted\n", __func__); - return ret; - - case -ENOBUFS: /* some pages are not cached and can't be */ - case -ENODATA: /* some pages are not cached */ - cifs_dbg(FYI, "%s: no page\n", __func__); - return 1; - - default: - cifs_dbg(FYI, "unknown error ret = %d\n", ret); - } - - return ret; + return -ENOBUFS; // Needs conversion to using netfslib } void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { struct cifsInodeInfo *cifsi = CIFS_I(inode); - int ret; WARN_ON(!cifsi->fscache); cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", __func__, cifsi->fscache, page, inode); - ret = fscache_write_page(cifsi->fscache, page, - cifsi->vfs_inode.i_size, GFP_KERNEL); - if (ret != 0) - fscache_uncache_page(cifsi->fscache, page); -} - -void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) -{ - cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", - __func__, CIFS_I(inode)->fscache, inode); - fscache_readpages_cancel(CIFS_I(inode)->fscache, pages); -} - -void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); - fscache_uncache_page(cookie, page); -} - -void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); -} - -void __cifs_fscache_uncache_page(struct inode *inode, struct page *page) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_uncache_page(cookie, page); + + // Needs conversion to using netfslib } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 9baa1d0f22bd..c6ca49ac33d4 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -13,84 +13,71 @@ #include "cifsglob.h" -#ifdef CONFIG_CIFS_FSCACHE - /* - * Auxiliary data attached to CIFS superblock within the cache + * Coherency data attached to CIFS volume within the cache */ -struct cifs_fscache_super_auxdata { - u64 resource_id; /* unique server resource id */ +struct cifs_fscache_volume_coherency_data { + __le64 resource_id; /* unique server resource id */ __le64 vol_create_time; - u32 vol_serial_number; + __le32 vol_serial_number; } __packed; /* - * Auxiliary data attached to CIFS inode within the cache + * Coherency data attached to CIFS inode within the cache. */ -struct cifs_fscache_inode_auxdata { - u64 last_write_time_sec; - u64 last_change_time_sec; - u32 last_write_time_nsec; - u32 last_change_time_nsec; - u64 eof; +struct cifs_fscache_inode_coherency_data { + __le64 last_write_time_sec; + __le64 last_change_time_sec; + __le32 last_write_time_nsec; + __le32 last_change_time_nsec; }; -/* - * cache.c - */ -extern struct fscache_netfs cifs_fscache_netfs; -extern const struct fscache_cookie_def cifs_fscache_server_index_def; -extern const struct fscache_cookie_def cifs_fscache_super_index_def; -extern const struct fscache_cookie_def cifs_fscache_inode_object_def; - -extern int cifs_fscache_register(void); -extern void cifs_fscache_unregister(void); +#ifdef CONFIG_CIFS_FSCACHE /* * fscache.c */ -extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern int cifs_fscache_get_super_cookie(struct cifs_tcon *); extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_get_inode_cookie(struct inode *inode); extern void cifs_fscache_release_inode_cookie(struct inode *); -extern void cifs_fscache_update_inode_cookie(struct inode *inode); -extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void cifs_fscache_reset_inode_cookie(struct inode *); +extern void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update); + +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + memset(cd, 0, sizeof(*cd)); + cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); +} + -extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); -extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page); -extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page); extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); extern int __cifs_readpage_from_fscache(struct inode *, struct page *); extern int __cifs_readpages_from_fscache(struct inode *, struct address_space *, struct list_head *, unsigned *); -extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); - extern void __cifs_readpage_to_fscache(struct inode *, struct page *); -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - if (PageFsCache(page)) - __cifs_fscache_invalidate_page(page, inode); + return CIFS_I(inode)->fscache; } -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) { - if (PageFsCache(page)) - __cifs_fscache_wait_on_page_write(inode, page); -} + struct cifs_fscache_inode_coherency_data cd; -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) -{ - if (PageFsCache(page)) - __cifs_fscache_uncache_page(inode, page); + cifs_fscache_fill_coherency(inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, + i_size_read(inode), flags); } static inline int cifs_readpage_from_fscache(struct inode *inode, @@ -120,41 +107,21 @@ static inline void cifs_readpage_to_fscache(struct inode *inode, __cifs_readpage_to_fscache(inode, page); } -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ - if (CIFS_I(inode)->fscache) - return __cifs_fscache_readpages_cancel(inode, pages); -} - #else /* CONFIG_CIFS_FSCACHE */ -static inline int cifs_fscache_register(void) { return 0; } -static inline void cifs_fscache_unregister(void) {} - -static inline void -cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} -static inline void -cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} -static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} -static inline void -cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} - -static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} -static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) { - return 1; /* May release page */ } -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) {} -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) {} +static inline int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { return 0; } +static inline void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} + +static inline void cifs_fscache_get_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {} +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) @@ -173,11 +140,6 @@ static inline int cifs_readpages_from_fscache(struct inode *inode, static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ -} - #endif /* CONFIG_CIFS_FSCACHE */ #endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index baa197edd8c5..7d8b3ceb2af3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1304,10 +1304,7 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; -#ifdef CONFIG_CIFS_FSCACHE - /* initialize per-inode cache cookie pointer */ - CIFS_I(inode)->fscache = NULL; -#endif + cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); } } @@ -1376,6 +1373,7 @@ iget_no_retry: iget_failed(inode); inode = ERR_PTR(rc); } + out: kfree(path); free_xid(xid); @@ -2263,6 +2261,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) int cifs_invalidate_mapping(struct inode *inode) { + struct cifs_fscache_inode_coherency_data cd; + struct cifsInodeInfo *cifsi = CIFS_I(inode); int rc = 0; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { @@ -2272,7 +2272,8 @@ cifs_invalidate_mapping(struct inode *inode) __func__, inode); } - cifs_fscache_reset_inode_cookie(inode); + cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); return rc; } @@ -2777,8 +2778,10 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) goto out; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); @@ -2973,8 +2976,10 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); From 484167da77739a8d0e225008c48e697fd3f781ae Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Jan 2022 15:19:04 +0800 Subject: [PATCH 0233/1295] btrfs: defrag: fix wrong number of defragged sectors [BUG] There are users using autodefrag mount option reporting obvious increase in IO: > If I compare the write average (in total, I don't have it per process) > when taking idle periods on the same machine: > Linux 5.16: > without autodefrag: ~ 10KiB/s > with autodefrag: between 1 and 2MiB/s. > > Linux 5.15: > with autodefrag:~ 10KiB/s (around the same as without > autodefrag on 5.16) [CAUSE] When autodefrag mount option is enabled, btrfs_defrag_file() will be called with @max_sectors = BTRFS_DEFRAG_BATCH (1024) to limit how many sectors we can defrag in one try. And then use the number of sectors defragged to determine if we need to re-defrag. But commit b18c3ab2343d ("btrfs: defrag: introduce helper to defrag one cluster") uses wrong unit to increase @sectors_defragged, which should be in unit of sector, not byte. This means, if we have defragged any sector, then @sectors_defragged will be >= sectorsize (normally 4096), which is larger than BTRFS_DEFRAG_BATCH. This makes the @max_sectors check in defrag_one_cluster() to underflow, rendering the whole @max_sectors check useless. Thus causing way more IO for autodefrag mount options, as now there is no limit on how many sectors can really be defragged. [FIX] Fix the problems by: - Use sector as unit when increasing @sectors_defragged - Include @sectors_defragged > @max_sectors case to break the loop - Add extra comment on the return value of btrfs_defrag_file() Reported-by: Anthony Ruhier Fixes: b18c3ab2343d ("btrfs: defrag: introduce helper to defrag one cluster") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6391be7409d8..7029ee75e301 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1442,8 +1442,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, list_for_each_entry(entry, &target_list, list) { u32 range_len = entry->len; - /* Reached the limit */ - if (max_sectors && max_sectors == *sectors_defragged) + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) break; if (max_sectors) @@ -1465,7 +1465,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, extent_thresh, newer_than, do_compress); if (ret < 0) break; - *sectors_defragged += range_len; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; } out: list_for_each_entry_safe(entry, tmp, &target_list, list) { @@ -1484,6 +1485,9 @@ out: * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged. */ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, From c080b4144b9dd3b7af838a194ffad3204ca15166 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Jan 2022 19:53:52 +0800 Subject: [PATCH 0234/1295] btrfs: defrag: properly update range->start for autodefrag [BUG] After commit 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") autodefrag no longer properly re-defrag the file from previously finished location. [CAUSE] The recent refactoring of defrag only focuses on defrag ioctl subpage support, doesn't take autodefrag into consideration. There are two problems involved which prevents autodefrag to restart its scan: - No range.start update Previously when one defrag target is found, range->start will be updated to indicate where next search should start from. But now btrfs_defrag_file() doesn't update it anymore, making all autodefrag to rescan from file offset 0. This would also make autodefrag to mark the same range dirty again and again, causing extra IO. - No proper quick exit for defrag_one_cluster() Currently if we reached or exceed @max_sectors limit, we just exit defrag_one_cluster(), and let next defrag_one_cluster() call to do a quick exit. This makes @cur increase, thus no way to properly know which range is defragged and which range is skipped. [FIX] The fix involves two modifications: - Update range->start to next cluster start This is a little different from the old behavior. Previously range->start is updated to the next defrag target. But in the end, the behavior should still be pretty much the same, as now we skip to next defrag target inside btrfs_defrag_file(). Thus if auto-defrag determines to re-scan, then we still do the skip, just at a different timing. - Make defrag_one_cluster() to return >0 to indicate a quick exit So that btrfs_defrag_file() can also do a quick exit, without increasing @cur to the range end, and re-use @cur to update @range->start. - Add comment for btrfs_defrag_file() to mention the range->start update Currently only autodefrag utilize this behavior, as defrag ioctl won't set @max_to_defrag parameter, thus unless interrupted it will always try to defrag the whole range. Reported-by: Filipe Manana Fixes: 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7029ee75e301..6d5d6d2c4ad1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1443,8 +1443,10 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u32 range_len = entry->len; /* Reached or beyond the limit */ - if (max_sectors && *sectors_defragged >= max_sectors) + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; break; + } if (max_sectors) range_len = min_t(u32, range_len, @@ -1487,7 +1489,10 @@ out: * will be defragged. * * Return <0 for error. - * Return >=0 for the number of sectors defragged. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). */ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, @@ -1580,10 +1585,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (ret < 0) break; cur = cluster_end + 1; + if (ret > 0) { + ret = 0; + break; + } } if (ra_allocated) kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; if (sectors_defragged) { /* * We have defragged some sectors, for compression case they From 8326c79d10be2ddbfd3d3804206949a71cb15675 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Nov 2021 10:43:52 -0300 Subject: [PATCH 0235/1295] tools headers UAPI: Sync x86 arch prctl headers with the kernel sources To pick the changes in this cset: 980fe2fddcff2193 ("x86/fpu: Extend fpu_xstate_prctl() with guest permissions") This picks these new prctls: $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/before $ cp arch/x86/include/uapi/asm/prctl.h tools/arch/x86/include/uapi/asm/prctl.h $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/after $ diff -u /tmp/before /tmp/after --- /tmp/before 2022-01-19 14:40:05.049394977 -0300 +++ /tmp/after 2022-01-19 14:40:35.628154565 -0300 @@ -9,6 +9,8 @@ [0x1021 - 0x1001]= "GET_XCOMP_SUPP", [0x1022 - 0x1001]= "GET_XCOMP_PERM", [0x1023 - 0x1001]= "REQ_XCOMP_PERM", + [0x1024 - 0x1001]= "GET_XCOMP_GUEST_PERM", + [0x1025 - 0x1001]= "REQ_XCOMP_GUEST_PERM", }; #define x86_arch_prctl_codes_2_offset 0x2001 $ With this 'perf trace' can translate those numbers into strings and use the strings in filter expressions: # perf trace -e prctl 0.000 ( 0.011 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9c014b7df5) = 0 0.032 ( 0.002 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9bb6b51580) = 0 5.452 ( 0.003 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfeb70) = 0 5.468 ( 0.002 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfea70) = 0 24.494 ( 0.009 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f562a32ae28) = 0 24.540 ( 0.002 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f563c6d4b30) = 0 670.281 ( 0.008 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30805c8) = 0 670.293 ( 0.002 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30800f0) = 0 ^C# This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/prctl.h' differs from latest version at 'arch/x86/include/uapi/asm/prctl.h' diff -u tools/arch/x86/include/uapi/asm/prctl.h arch/x86/include/uapi/asm/prctl.h Cc: Paolo Bonzini Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/prctl.h | 26 +++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h index 754a07856817..500b96e71f18 100644 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ b/tools/arch/x86/include/uapi/asm/prctl.h @@ -2,20 +2,22 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 -#define ARCH_GET_XCOMP_SUPP 0x1021 -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ From 902d6364aad5fc8731ac5b19ed9a1d9c9ee0bd91 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:49 +0800 Subject: [PATCH 0236/1295] riscv: mm: init: remove unnecessary "#ifdef CONFIG_CRASH_DUMP" The is_kdump_kernel() returns false for !CRASH_DUMP case, so we don't need the #ifdef CONFIG_CRASH_DUMP for is_kdump_kernel() checking. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0624c68331d8..122352823835 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -791,12 +791,10 @@ static void __init reserve_crashkernel(void) * since it doesn't make much sense and we have limited memory * resources. */ -#ifdef CONFIG_CRASH_DUMP if (is_kdump_kernel()) { pr_info("crashkernel: ignoring reservation request\n"); return; } -#endif ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); From 07aabe8fb6d1ac3163cc74c856521f2ee746270b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:50 +0800 Subject: [PATCH 0237/1295] riscv: mm: init: try best to use IS_ENABLED(CONFIG_64BIT) instead of #ifdef Try our best to replace the conditional compilation using "#ifdef CONFIG_64BIT" by a check for "IS_ENABLED(CONFIG_64BIT)", to simplify the code and to increase compile coverage. Now we can also remove the __maybe_unused used in max_mapped_addr declaration. We also remove the BUG_ON check of mapping the last 4K bytes of the addressable memory since this is always true for every kernel actually. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 122352823835..fd506637c372 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -102,10 +102,9 @@ static void __init print_vm_layout(void) (unsigned long)VMALLOC_END); print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); -#ifdef CONFIG_64BIT - print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, - (unsigned long)ADDRESS_SPACE_END); -#endif + if (IS_ENABLED(CONFIG_64BIT)) + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, + (unsigned long)ADDRESS_SPACE_END); } #else static void print_vm_layout(void) { } @@ -163,7 +162,7 @@ static void __init setup_bootmem(void) { phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); - phys_addr_t __maybe_unused max_mapped_addr; + phys_addr_t max_mapped_addr; phys_addr_t phys_ram_end; #ifdef CONFIG_XIP_KERNEL @@ -172,17 +171,16 @@ static void __init setup_bootmem(void) memblock_enforce_memory_limit(memory_limit); - /* - * Reserve from the start of the kernel to the end of the kernel - */ -#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) /* * Make sure we align the reservation on PMD_SIZE since we will * map the kernel in the linear mapping as read-only: we do not want * any allocation to happen between _end and the next pmd aligned page. */ - vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; -#endif + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; + /* + * Reserve from the start of the kernel to the end of the kernel + */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); @@ -190,7 +188,6 @@ static void __init setup_bootmem(void) #ifndef CONFIG_XIP_KERNEL phys_ram_base = memblock_start_of_DRAM(); #endif -#ifndef CONFIG_64BIT /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE @@ -200,10 +197,11 @@ static void __init setup_bootmem(void) * address space is occupied by the kernel mapping then this check must * be done as soon as the kernel mapping base address is determined. */ - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); -#endif + if (!IS_ENABLED(CONFIG_64BIT)) { + max_mapped_addr = __pa(~(ulong)0); + if (max_mapped_addr == (phys_ram_end - 1)) + memblock_set_current_limit(max_mapped_addr - 4096); + } min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); @@ -617,14 +615,6 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); -#ifdef CONFIG_64BIT - /* - * The last 4K bytes of the addressable memory can not be mapped because - * of IS_ERR_VALUE macro. - */ - BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); -#endif - pt_ops.alloc_pte = alloc_pte_early; pt_ops.get_pte_virt = get_pte_virt_early; #ifndef __PAGETABLE_PMD_FOLDED @@ -736,10 +726,9 @@ static void __init setup_vm_final(void) } } -#ifdef CONFIG_64BIT /* Map the kernel */ - create_kernel_page_table(swapper_pg_dir, false); -#endif + if (IS_ENABLED(CONFIG_64BIT)) + create_kernel_page_table(swapper_pg_dir, false); /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); From 3274a6ef3b1bd9faef940c40ef201723d5d3d056 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:51 +0800 Subject: [PATCH 0238/1295] riscv: mm: init: remove _pt_ops and use pt_ops directly Except "pt_ops", other global vars when CONFIG_XIP_KERNEL=y is defined as below: |foo_type foo; |#ifdef CONFIG_XIP_KERNEL |#define foo (*(foo_type *)XIP_FIXUP(&foo)) |#endif Follow the same way for pt_ops to unify the style and to simplify code. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fd506637c372..f96acd26801f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -227,12 +227,10 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; +static struct pt_alloc_ops pt_ops __initdata; #ifdef CONFIG_XIP_KERNEL -#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) -#else -#define pt_ops _pt_ops +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) #endif unsigned long riscv_pfn_base __ro_after_init; From fe036db7d8a93cfbfd254f76e0ecf81aecbbf6b4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:52 +0800 Subject: [PATCH 0239/1295] riscv: mm: init: try IS_ENABLED(CONFIG_XIP_KERNEL) instead of #ifdef Try our best to replace the conditional compilation using "#ifdef CONFIG_XIP_KERNEL" with "IS_ENABLED(CONFIG_XIP_KERNEL)", to simplify the code and to increase compile coverage. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f96acd26801f..b20d6d2e184c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -161,13 +161,13 @@ early_param("mem", early_mem); static void __init setup_bootmem(void) { phys_addr_t vmlinux_end = __pa_symbol(&_end); - phys_addr_t vmlinux_start = __pa_symbol(&_start); phys_addr_t max_mapped_addr; - phys_addr_t phys_ram_end; + phys_addr_t phys_ram_end, vmlinux_start; -#ifdef CONFIG_XIP_KERNEL - vmlinux_start = __pa_symbol(&_sdata); -#endif + if (IS_ENABLED(CONFIG_XIP_KERNEL)) + vmlinux_start = __pa_symbol(&_sdata); + else + vmlinux_start = __pa_symbol(&_start); memblock_enforce_memory_limit(memory_limit); @@ -183,11 +183,9 @@ static void __init setup_bootmem(void) */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); -#ifndef CONFIG_XIP_KERNEL - phys_ram_base = memblock_start_of_DRAM(); -#endif + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + phys_ram_base = memblock_start_of_DRAM(); /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE From 805a3ebed59f81155ded218648db58bdc886a881 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:53 +0800 Subject: [PATCH 0240/1295] riscv: mm: init: try best to remove #ifdef CONFIG_XIP_KERNEL usage Currently, the #ifdef CONFIG_XIP_KERNEL usage can be divided into the following three types: The first one is for functions/declarations only used in XIP case. The second one is for XIP_FIXUP case. Something as below: |foo_type foo; |#ifdef CONFIG_XIP_KERNEL |#define foo (*(foo_type *)XIP_FIXUP(&foo)) |#endif Usually, it's better to let the foo macro sit with the foo var together. But if various foos are defined adjacently, we can save some #ifdef CONFIG_XIP_KERNEL usage by grouping them together. The third one is for different implementations for XIP, usually, this is a #ifdef...#else...#endif case. This patch moves the pt_ops macro to adjacent #ifdef CONFIG_XIP_KERNEL and group first type usage cases into one. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b20d6d2e184c..c1fffec37749 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -40,10 +40,6 @@ EXPORT_SYMBOL(kernel_map); phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); -#ifdef CONFIG_XIP_KERNEL -extern char _xiprom[], _exiprom[], __data_loc; -#endif - unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -227,10 +223,6 @@ static void __init setup_bootmem(void) #ifdef CONFIG_MMU static struct pt_alloc_ops pt_ops __initdata; -#ifdef CONFIG_XIP_KERNEL -#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) -#endif - unsigned long riscv_pfn_base __ro_after_init; EXPORT_SYMBOL(riscv_pfn_base); @@ -242,6 +234,7 @@ pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) @@ -446,6 +439,8 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) } #ifdef CONFIG_XIP_KERNEL +extern char _xiprom[], _exiprom[], __data_loc; + /* called from head.S with MMU off */ asmlinkage void __init __copy_data(void) { From fa68118144c63e292628945c5b8feb16b84fea7d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Jan 2022 13:30:23 -0500 Subject: [PATCH 0241/1295] kvm: selftests: sync uapi/linux/kvm.h with Linux header KVM_CAP_XSAVE2 is out of sync due to a conflict. Copy the whole file while at it. Reported-by: Yang Zhong Signed-off-by: Paolo Bonzini --- tools/include/uapi/linux/kvm.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f066637ee206..9563d294f181 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 -#define KVM_CAP_XSAVE2 207 +#define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 #ifdef KVM_CAP_IRQ_ROUTING @@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1209,6 +1220,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) struct kvm_xen_hvm_config { __u32 flags; @@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_XSAVE */ #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) @@ -1613,6 +1623,9 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; From 96c852c8bf52af2d34654e700d9b5d2e8a99bae5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Jan 2022 13:34:30 -0500 Subject: [PATCH 0242/1295] kvm: selftests: Do not indent with spaces Some indentation with spaces crept in, likely due to terminal-based cut and paste. Clean it up. Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 10 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 8 ++- .../selftests/kvm/lib/x86_64/processor.c | 60 +++++++++---------- .../selftests/kvm/x86_64/tsc_msrs_test.c | 4 +- .../kvm/x86_64/vmx_close_while_nested_test.c | 4 +- .../selftests/kvm/x86_64/xen_shinfo_test.c | 34 +++++------ 6 files changed, 61 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 122447827954..423d8a61bd2e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -368,14 +368,14 @@ bool is_amd_cpu(void); static inline unsigned int x86_family(unsigned int eax) { - unsigned int x86; + unsigned int x86; - x86 = (eax >> 8) & 0xf; + x86 = (eax >> 8) & 0xf; - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; - return x86; + return x86; } static inline unsigned int x86_model(unsigned int eax) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c22a17aac6b0..8c53f96ab7fe 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -499,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, uint64_t first_page, uint32_t num_pages) { - struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, - .first_page = first_page, - .num_pages = num_pages }; + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; int ret; ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 59dcfe1967cc..5c8c270a9158 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1144,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", + r); r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", + r); r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", + r); r = vcpu_save_xsave_state(vm, vcpu, state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", + r); if (kvm_check_cap(KVM_CAP_XCRS)) { r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); @@ -1171,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) } r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", + r); if (nested_size) { state->nested.size = sizeof(state->nested_); r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", - r); + r); TEST_ASSERT(state->nested.size <= nested_size, - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", - state->nested.size, nested_size); + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); } else state->nested.size = 0; @@ -1189,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) for (i = 0; i < nmsrs; i++) state->msrs.entries[i].index = list->indices[i]; r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == nmsrs ? -1 : list->indices[r]); + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", + r, r == nmsrs ? -1 : list->indices[r]); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", + r); free(list); return state; @@ -1207,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", - r); + r); r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); TEST_ASSERT(r == state->msrs.nmsrs, @@ -1222,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", - r); + r); r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", + r); if (state->nested.size) { r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", - r); + r); } } diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index 5a6a662f2e59..a426078b16a3 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) switch (get_ucall(vm, vcpuid, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); + uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); return; case UCALL_DONE: return; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index 2835a17f1b7a..edac8839e717 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -30,8 +30,8 @@ static struct kvm_vm *vm; static void l2_guest_code(void) { /* Exit to L0 */ - asm volatile("inb %%dx, %%al" - : : [port] "d" (PORT_L0_EXIT) : "rax"); + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); } static void l1_guest_code(struct vmx_pages *vmx_pages) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 478e0ae8b93e..865e17146815 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -46,20 +46,20 @@ static struct kvm_vm *vm; #define MIN_STEAL_TIME 50000 struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; } __attribute__((__packed__)); /* 32 bytes */ struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; + u32 version; + u32 sec; + u32 nsec; } __attribute__((__packed__)); struct vcpu_runstate_info { @@ -74,11 +74,11 @@ struct arch_vcpu_info { }; struct vcpu_info { - uint8_t evtchn_upcall_pending; - uint8_t evtchn_upcall_mask; - unsigned long evtchn_pending_sel; - struct arch_vcpu_info arch; - struct pvclock_vcpu_time_info time; + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ struct shared_info { @@ -493,7 +493,7 @@ int main(int argc, char *argv[]) vm_ts.tv_sec = wc->sec; vm_ts.tv_nsec = wc->nsec; - TEST_ASSERT(wc->version && !(wc->version & 1), + TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); From fc839c6d33c8828514f595822f457e51328507e5 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 11 Jan 2022 00:52:08 +0800 Subject: [PATCH 0243/1295] riscv: bpf: Fix eBPF's exception tables eBPF's exception tables needs to be modified to relative synchronously. Suggested-by: Tong Tiangen Signed-off-by: Jisheng Zhang Fixes: 1f77ed9422cb ("riscv: switch to relative extable and other improvements") Signed-off-by: Palmer Dabbelt --- arch/riscv/net/bpf_jit_comp64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 293dd6e171ed..0bcda99d1d68 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -497,7 +497,7 @@ static int add_exception_handler(const struct bpf_insn *insn, offset = pc - (long)&ex->insn; if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) return -ERANGE; - ex->insn = pc; + ex->insn = offset; /* * Since the extable follows the program, the fixup offset is always From 6191cf3ad59fda5901160633fef8e41b064a5246 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 18 Jan 2022 11:32:35 -0800 Subject: [PATCH 0244/1295] xfs: flush inodegc workqueue tasks before cancel The xfs_inodegc_stop() helper performs a high level flush of pending work on the percpu queues and then runs a cancel_work_sync() on each of the percpu work tasks to ensure all work has completed before returning. While cancel_work_sync() waits for wq tasks to complete, it does not guarantee work tasks have started. This means that the _stop() helper can queue and instantly cancel a wq task without having completed the associated work. This can be observed by tracepoint inspection of a simple "rm -f ; fsfreeze -f " test: xfs_destroy_inode: ... ino 0x83 ... xfs_inode_set_need_inactive: ... ino 0x83 ... xfs_inodegc_stop: ... ... xfs_inodegc_start: ... xfs_inodegc_worker: ... xfs_inode_inactivating: ... ino 0x83 ... The first few lines show that the inode is removed and need inactive state set, but the inactivation work has not completed before the inodegc mechanism stops. The inactivation doesn't actually occur until the fs is unfrozen and the gc mechanism starts back up. Note that this test requires fsfreeze to reproduce because xfs_freeze indirectly invokes xfs_fs_statfs(), which calls xfs_inodegc_flush(). When this occurs, the workqueue try_to_grab_pending() logic first tries to steal the pending bit, which does not succeed because the bit has been set by queue_work_on(). Subsequently, it checks for association of a pool workqueue from the work item under the pool lock. This association is set at the point a work item is queued and cleared when dequeued for processing. If the association exists, the work item is removed from the queue and cancel_work_sync() returns true. If the pwq association is cleared, the remove attempt assumes the task is busy and retries (eventually returning false to the caller after waiting for the work task to complete). To avoid this race, we can flush each work item explicitly before cancel. However, since the _queue_all() already schedules each underlying work item, the workqueue level helpers are sufficient to achieve the same ordering effect. E.g., the inodegc enabled flag prevents scheduling any further work in the _stop() case. Use the drain_workqueue() helper in this particular case to make the intent a bit more self explanatory. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d019c98eb839..7a2a5e2be3cf 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1852,28 +1852,20 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately, and - * wait for the work to finish. Two pass - queue all the work first pass, wait - * for it in a second pass. + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. */ void xfs_inodegc_flush( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_is_inodegc_enabled(mp)) return; trace_xfs_inodegc_flush(mp, __return_address); xfs_inodegc_queue_all(mp); - - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - flush_work(&gc->work); - } + flush_workqueue(mp->m_inodegc_wq); } /* @@ -1884,18 +1876,12 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_clear_inodegc_enabled(mp)) return; xfs_inodegc_queue_all(mp); + drain_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - cancel_work_sync(&gc->work); - } trace_xfs_inodegc_stop(mp, __return_address); } From db1503d355a79d1d4255a9996f20e72848b74a56 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 17 Jan 2022 10:57:16 +0100 Subject: [PATCH 0245/1295] riscv: Get rid of MAXPHYSMEM configs CONFIG_MAXPHYSMEM_* are actually never used, even the nommu defconfigs selecting the MAXPHYSMEM_2GB had no effects on PAGE_OFFSET since it was preempted by !MMU case right before. In addition, the move of the kernel mapping at the end of the address space broke the use of MAXPHYSMEM_2G with MMU since it defines PAGE_OFFSET at the same address as the kernel mapping. Reported-by: Geert Uytterhoeven Fixes: 2bfc6cd81bd1 ("riscv: Move kernel mapping outside of linear mapping") Signed-off-by: Alexandre Ghiti Tested-by: Geert Uytterhoeven Tested-by: Conor Dooley Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 23 ++----------------- arch/riscv/configs/nommu_k210_defconfig | 2 -- .../riscv/configs/nommu_k210_sdcard_defconfig | 2 -- arch/riscv/configs/nommu_virt_defconfig | 1 - 4 files changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 821252b65f89..7a68a4106e5a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -158,10 +158,9 @@ config PA_BITS config PAGE_OFFSET hex - default 0xC0000000 if 32BIT && MAXPHYSMEM_1GB + default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB - default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB + default 0xffffffe000000000 if 64BIT config KASAN_SHADOW_OFFSET hex @@ -270,24 +269,6 @@ config MODULE_SECTIONS bool select HAVE_MOD_ARCH_SPECIFIC -choice - prompt "Maximum Physical Memory" - default MAXPHYSMEM_1GB if 32BIT - default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW - default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY - - config MAXPHYSMEM_1GB - depends on 32BIT - bool "1GiB" - config MAXPHYSMEM_2GB - depends on 64BIT && CMODEL_MEDLOW - bool "2GiB" - config MAXPHYSMEM_128GB - depends on 64BIT && CMODEL_MEDANY - bool "128GiB" -endchoice - - config SMP bool "Symmetric Multi-Processing" help diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index b16a2a12c82a..3b9f83221f9c 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -29,8 +29,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0" diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 61f887f65419..d68b743d580f 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -21,8 +21,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro" diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index e046a0babde4..f224be697785 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -27,7 +27,6 @@ CONFIG_SLOB=y # CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_VIRT=y -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0" CONFIG_CMDLINE_FORCE=y From f7ae02333d13f598da6ff6b94cf643255707f752 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:45 +0100 Subject: [PATCH 0246/1295] riscv: Move KASAN mapping next to the kernel mapping Now that KASAN_SHADOW_OFFSET is defined at compile time as a config, this value must remain constant whatever the size of the virtual address space, which is only possible by pushing this region at the end of the address space next to the kernel mapping. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- Documentation/riscv/vm-layout.rst | 12 ++++++------ arch/riscv/Kconfig | 4 ++-- arch/riscv/include/asm/kasan.h | 4 ++-- arch/riscv/include/asm/page.h | 2 -- arch/riscv/include/asm/pgtable.h | 22 +++++++++++++++------- arch/riscv/mm/init.c | 25 +++++++++++++------------ 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index b7f98930d38d..1bd687b97104 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -47,12 +47,12 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc000000000 | -256 GB | ffffffc7ffffffff | 32 GB | kasan - ffffffcefee00000 | -196 GB | ffffffcefeffffff | 2 MB | fixmap - ffffffceff000000 | -196 GB | ffffffceffffffff | 16 MB | PCI io - ffffffcf00000000 | -196 GB | ffffffcfffffffff | 4 GB | vmemmap - ffffffd000000000 | -192 GB | ffffffdfffffffff | 64 GB | vmalloc/ioremap space - ffffffe000000000 | -128 GB | ffffffff7fffffff | 124 GB | direct mapping of all physical memory + ffffffc6fee00000 | -228 GB | ffffffc6feffffff | 2 MB | fixmap + ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io + ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap + ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space + ffffffd800000000 | -160 GB | fffffff6ffffffff | 124 GB | direct mapping of all physical memory + fffffff700000000 | -36 GB | fffffffeffffffff | 32 GB | kasan __________________|____________|__________________|_________|____________________________________________________________ | | diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7a68a4106e5a..7f8aea333291 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -160,12 +160,12 @@ config PAGE_OFFSET hex default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffe000000000 if 64BIT + default 0xffffffd800000000 if 64BIT config KASAN_SHADOW_OFFSET hex depends on KASAN_GENERIC - default 0xdfffffc800000000 if 64BIT + default 0xdfffffff00000000 if 64BIT default 0xffffffff if 32BIT config ARCH_FLATMEM_ENABLE diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index b00f503ec124..257a2495145a 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -28,8 +28,8 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START KERN_VIRT_START -#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) +#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) +#define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index b3e5ff0125fe..a18c989e600c 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -33,8 +33,6 @@ */ #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#define KERN_VIRT_SIZE (-PAGE_OFFSET) - #ifndef __ASSEMBLY__ #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index bf204e7c1f74..fa400b2abb36 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -24,6 +24,17 @@ #define KERNEL_LINK_ADDR PAGE_OFFSET #endif +/* Number of entries in the page global directory */ +#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) +/* Number of entries in the page table */ +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) + +/* + * Half of the kernel address space (half of the entries of the page global + * directory) is for the direct mapping. + */ +#define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2) + #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) @@ -39,8 +50,10 @@ /* Modules always live before the kernel */ #ifdef CONFIG_64BIT -#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) -#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) +/* This is used to define the end of the KASAN shadow region */ +#define MODULES_LOWEST_VADDR (KERNEL_LINK_ADDR - SZ_2G) +#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) +#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) #endif /* @@ -108,11 +121,6 @@ #endif /* CONFIG_XIP_KERNEL */ #ifdef CONFIG_MMU -/* Number of entries in the page global directory */ -#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) -/* Number of entries in the page table */ -#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) - /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 24b2b8044602..f515964e9c00 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -103,6 +103,9 @@ static void __init print_vm_layout(void) print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); #ifdef CONFIG_64BIT +#ifdef CONFIG_KASAN + print_mlm("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); +#endif print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, (unsigned long)ADDRESS_SPACE_END); #endif @@ -130,18 +133,8 @@ void __init mem_init(void) print_vm_layout(); } -/* - * The default maximal physical memory size is -PAGE_OFFSET for 32-bit kernel, - * whereas for 64-bit kernel, the end of the virtual address space is occupied - * by the modules/BPF/kernel mappings which reduces the available size of the - * linear mapping. - * Limit the memory size via mem. - */ -#ifdef CONFIG_64BIT -static phys_addr_t memory_limit = -PAGE_OFFSET - SZ_4G; -#else -static phys_addr_t memory_limit = -PAGE_OFFSET; -#endif +/* Limit the memory size via mem. */ +static phys_addr_t memory_limit; static int __init early_mem(char *p) { @@ -612,6 +605,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); + /* + * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit + * kernel, whereas for 64-bit kernel, the end of the virtual address + * space is occupied by the modules/BPF/kernel mappings which reduces + * the available size of the linear mapping. + */ + memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); From 2efad17e5794f4223bbeff1b2c568e3afd9a8c22 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:46 +0100 Subject: [PATCH 0247/1295] riscv: Split early kasan mapping to prepare sv48 introduction Now that kasan shadow region is next to the kernel, for sv48, this region won't be aligned on PGDIR_SIZE and then when populating this region, we'll need to get down to lower levels of the page table. So instead of reimplementing the page table walk for the early population, take advantage of the existing functions used for the final population. Note that kasan swapper initialization must also be split since memblock is not initialized at this point and as the last PGD is shared with the kernel, we'd need to allocate a PUD so postpone the kasan final population after the kernel population is done. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kasan.h | 1 + arch/riscv/mm/init.c | 4 ++ arch/riscv/mm/kasan_init.c | 113 ++++++++++++++++++--------------- 3 files changed, 67 insertions(+), 51 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 257a2495145a..2788e2c46609 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -34,6 +34,7 @@ void kasan_init(void); asmlinkage void kasan_early_init(void); +void kasan_swapper_init(void); #endif #endif diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f515964e9c00..f5d0052c95ad 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -741,6 +741,10 @@ static void __init setup_vm_final(void) create_kernel_page_table(swapper_pg_dir, false); #endif +#ifdef CONFIG_KASAN + kasan_swapper_init(); +#endif + /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 54294f83513d..1434a0225140 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -12,44 +12,6 @@ #include extern pgd_t early_pg_dir[PTRS_PER_PGD]; -asmlinkage void __init kasan_early_init(void) -{ - uintptr_t i; - pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START); - - BUILD_BUG_ON(KASAN_SHADOW_OFFSET != - KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); - - for (i = 0; i < PTRS_PER_PTE; ++i) - set_pte(kasan_early_shadow_pte + i, - mk_pte(virt_to_page(kasan_early_shadow_page), - PAGE_KERNEL)); - - for (i = 0; i < PTRS_PER_PMD; ++i) - set_pmd(kasan_early_shadow_pmd + i, - pfn_pmd(PFN_DOWN - (__pa((uintptr_t) kasan_early_shadow_pte)), - __pgprot(_PAGE_TABLE))); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); - - /* init for swapper_pg_dir */ - pgd = pgd_offset_k(KASAN_SHADOW_START); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); - - local_flush_tlb_all(); -} static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -108,26 +70,35 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } -static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool early) { phys_addr_t phys_addr; - pgd_t *pgdp = pgd_offset_k(vaddr); unsigned long next; do { next = pgd_addr_end(vaddr, end); - /* - * pgdp can't be none since kasan_early_init initialized all KASAN - * shadow region with kasan_early_shadow_pmd: if this is stillthe case, - * that means we can try to allocate a hugepage as a replacement. - */ - if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pmd) && - IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { - phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); - if (phys_addr) { - set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { + if (early) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pgd_next); + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); continue; + } else if (pgd_page_vaddr(*pgdp) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) { + /* + * pgdp can't be none since kasan_early_init + * initialized all KASAN shadow region with + * kasan_early_shadow_pud: if this is still the + * case, that means we can try to allocate a + * hugepage as a replacement. + */ + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } } } @@ -135,12 +106,52 @@ static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) } while (pgdp++, vaddr = next, vaddr != end); } +asmlinkage void __init kasan_early_init(void) +{ + uintptr_t i; + + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); + + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_early_shadow_pte + i, + mk_pte(virt_to_page(kasan_early_shadow_page), + PAGE_KERNEL)); + + for (i = 0; i < PTRS_PER_PMD; ++i) + set_pmd(kasan_early_shadow_pmd + i, + pfn_pmd(PFN_DOWN + (__pa((uintptr_t)kasan_early_shadow_pte)), + PAGE_TABLE)); + + if (pgtable_l4_enabled) { + for (i = 0; i < PTRS_PER_PUD; ++i) + set_pud(kasan_early_shadow_pud + i, + pfn_pud(PFN_DOWN + (__pa(((uintptr_t)kasan_early_shadow_pmd))), + PAGE_TABLE)); + } + + kasan_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + +void __init kasan_swapper_init(void) +{ + kasan_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + static void __init kasan_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - kasan_populate_pgd(vaddr, vend); + kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend, false); local_flush_tlb_all(); memset(start, KASAN_SHADOW_INIT, end - start); From 840125a97abc7e676d839adc2743e8f703a156b3 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:47 +0100 Subject: [PATCH 0248/1295] riscv: Introduce functions to switch pt_ops This simply gathers the different pt_ops initialization in functions where a comment was added to explain why the page table operations must be changed along the boot process. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 74 ++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f5d0052c95ad..424d7777f4df 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -581,6 +581,52 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) dtb_early_pa = dtb_pa; } +/* + * MMU is not enabled, the page tables are allocated directly using + * early_pmd/pud/p4d and the address returned is the physical one. + */ +void pt_ops_set_early(void) +{ + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; +#endif +} + +/* + * MMU is enabled but page table setup is not complete yet. + * fixmap page table alloc functions must be used as a means to temporarily + * map the allocated physical pages since the linear mapping does not exist yet. + * + * Note that this is called with MMU disabled, hence kernel_mapping_pa_to_va, + * but it will be used as described above. + */ +void pt_ops_set_fixmap(void) +{ + pt_ops.alloc_pte = kernel_mapping_pa_to_va((uintptr_t)alloc_pte_fixmap); + pt_ops.get_pte_virt = kernel_mapping_pa_to_va((uintptr_t)get_pte_virt_fixmap); +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); + pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); +#endif +} + +/* + * MMU is enabled and page table setup is complete, so from now, we can use + * generic page allocation functions to setup page table. + */ +void pt_ops_set_late(void) +{ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; +#endif +} + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; @@ -625,12 +671,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); #endif - pt_ops.alloc_pte = alloc_pte_early; - pt_ops.get_pte_virt = get_pte_virt_early; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_early; - pt_ops.get_pmd_virt = get_pmd_virt_early; -#endif + pt_ops_set_early(); + /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); @@ -694,6 +736,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); } #endif + + pt_ops_set_fixmap(); } static void __init setup_vm_final(void) @@ -702,16 +746,6 @@ static void __init setup_vm_final(void) phys_addr_t pa, start, end; u64 i; - /** - * MMU is enabled at this point. But page table setup is not complete yet. - * fixmap page table alloc functions should be used at this point - */ - pt_ops.alloc_pte = alloc_pte_fixmap; - pt_ops.get_pte_virt = get_pte_virt_fixmap; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_fixmap; - pt_ops.get_pmd_virt = get_pmd_virt_fixmap; -#endif /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), @@ -753,13 +787,7 @@ static void __init setup_vm_final(void) csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); - /* generic page allocation functions must be used to setup page table */ - pt_ops.alloc_pte = alloc_pte_late; - pt_ops.get_pte_virt = get_pte_virt_late; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_late; - pt_ops.get_pmd_virt = get_pmd_virt_late; -#endif + pt_ops_set_late(); } #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) From 3270bfdb9e4a01bb15d018612a6354c1837b5f97 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:48 +0100 Subject: [PATCH 0249/1295] riscv: Allow to dynamically define VA_BITS With 4-level page table folding at runtime, we don't know at compile time the size of the virtual address space so we must set VA_BITS dynamically so that sparsemem reserves the right amount of memory for struct pages. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 10 ---------- arch/riscv/include/asm/kasan.h | 2 +- arch/riscv/include/asm/pgtable.h | 10 ++++++++-- arch/riscv/include/asm/sparsemem.h | 6 +++++- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7f8aea333291..7945bacb1d0e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -146,16 +146,6 @@ config MMU Select if you want MMU-based virtualised addressing space support by paged memory management. If unsure, say 'Y'. -config VA_BITS - int - default 32 if 32BIT - default 39 if 64BIT - -config PA_BITS - int - default 34 if 32BIT - default 56 if 64BIT - config PAGE_OFFSET hex default 0xC0000000 if 32BIT diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 2788e2c46609..743e6ff57996 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -27,7 +27,7 @@ */ #define KASAN_SHADOW_SCALE_SHIFT 3 -#define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) +#define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) #define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index fa400b2abb36..23c7c66f2d79 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -61,8 +61,14 @@ * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ +#ifdef CONFIG_64BIT +#define VA_BITS 39 +#else +#define VA_BITS 32 +#endif + #define VMEMMAP_SHIFT \ - (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) + (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) #define VMEMMAP_END (VMALLOC_START - 1) #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) @@ -661,7 +667,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * and give the kernel the other (upper) half. */ #ifdef CONFIG_64BIT -#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE) +#define KERN_VIRT_START (-(BIT(VA_BITS)) + TASK_SIZE) #else #define KERN_VIRT_START FIXADDR_START #endif diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h index 45a7018a8118..63acaecc3374 100644 --- a/arch/riscv/include/asm/sparsemem.h +++ b/arch/riscv/include/asm/sparsemem.h @@ -4,7 +4,11 @@ #define _ASM_RISCV_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM -#define MAX_PHYSMEM_BITS CONFIG_PA_BITS +#ifdef CONFIG_64BIT +#define MAX_PHYSMEM_BITS 56 +#else +#define MAX_PHYSMEM_BITS 34 +#endif /* CONFIG_64BIT */ #define SECTION_SIZE_BITS 27 #endif /* CONFIG_SPARSEMEM */ From 60639f74c2f4fcc3ffa2ac0b120eaa874ccc713f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:50 +0100 Subject: [PATCH 0250/1295] asm-generic: Prepare for riscv use of pud_alloc_one and pud_free In the following commits, riscv will almost use the generic versions of pud_alloc_one and pud_free but an additional check is required since those functions are only relevant when using at least a 4-level page table, which will be determined at runtime on riscv. So move the content of those functions into other functions that riscv can use without duplicating code. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- include/asm-generic/pgalloc.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 02932efad3ab..977bea16cf1b 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -147,6 +147,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 +static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + return (pud_t *)get_zeroed_page(gfp); +} + #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate a page for PUD-level page table @@ -159,20 +168,23 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) */ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (pud_t *)get_zeroed_page(gfp); + return __pud_alloc_one(mm, addr); } #endif -static inline void pud_free(struct mm_struct *mm, pud_t *pud) +static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); free_page((unsigned long)pud); } +#ifndef __HAVE_ARCH_PUD_FREE +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + __pud_free(mm, pud); +} +#endif + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #ifndef __HAVE_ARCH_PGD_FREE From e8a62cc26ddf53a3c6ba2a8d33036cf7b84f3923 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:51 +0100 Subject: [PATCH 0251/1295] riscv: Implement sv48 support By adding a new 4th level of page table, give the possibility to 64bit kernel to address 2^48 bytes of virtual address: in practice, that offers 128TB of virtual address space to userspace and allows up to 64TB of physical memory. If the underlying hardware does not support sv48, we will automatically fallback to a standard 3-level page table by folding the new PUD level into PGDIR level. In order to detect HW capabilities at runtime, we use SATP feature that ignores writes with an unsupported mode. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 4 +- arch/riscv/include/asm/csr.h | 3 +- arch/riscv/include/asm/fixmap.h | 1 + arch/riscv/include/asm/kasan.h | 6 +- arch/riscv/include/asm/page.h | 14 ++ arch/riscv/include/asm/pgalloc.h | 40 +++++ arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- arch/riscv/include/asm/pgtable.h | 24 ++- arch/riscv/kernel/head.S | 3 +- arch/riscv/mm/context.c | 4 +- arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- drivers/firmware/efi/libstub/efi-stub.c | 2 + 13 files changed, 514 insertions(+), 44 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7945bacb1d0e..c4289d755dd4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -150,7 +150,7 @@ config PAGE_OFFSET hex default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffd800000000 if 64BIT + default 0xffffaf8000000000 if 64BIT config KASAN_SHADOW_OFFSET hex @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int - default 3 if 64BIT + default 4 if 64BIT default 2 config LOCKDEP_SUPPORT diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 5046f431645c..ae711692eec9 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -40,14 +40,13 @@ #ifndef CONFIG_64BIT #define SATP_PPN _AC(0x003FFFFF, UL) #define SATP_MODE_32 _AC(0x80000000, UL) -#define SATP_MODE SATP_MODE_32 #define SATP_ASID_BITS 9 #define SATP_ASID_SHIFT 22 #define SATP_ASID_MASK _AC(0x1FF, UL) #else #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) #define SATP_MODE_39 _AC(0x8000000000000000, UL) -#define SATP_MODE SATP_MODE_39 +#define SATP_MODE_48 _AC(0x9000000000000000, UL) #define SATP_ASID_BITS 16 #define SATP_ASID_SHIFT 44 #define SATP_ASID_MASK _AC(0xFFFF, UL) diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 54cbf07fb4e9..58a718573ad6 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -24,6 +24,7 @@ enum fixed_addresses { FIX_HOLE, FIX_PTE, FIX_PMD, + FIX_PUD, FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 743e6ff57996..0b85e363e778 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -28,7 +28,11 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) +/* + * Depending on the size of the virtual address space, the region may not be + * aligned on PGDIR_SIZE, so force its alignment to ease its population. + */ +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index a18c989e600c..160e3a1e8f8b 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -31,7 +31,20 @@ * When not using MMU this corresponds to the first free page in * physical memory (aligned on a page boundary). */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_MMU +#define PAGE_OFFSET kernel_map.page_offset +#else #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif +/* + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so + * define the PAGE_OFFSET value for SV39. + */ +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) +#else +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif /* CONFIG_64BIT */ #ifndef __ASSEMBLY__ @@ -84,6 +97,7 @@ extern unsigned long riscv_pfn_base; #endif /* CONFIG_MMU */ struct kernel_mapping { + unsigned long page_offset; unsigned long virt_addr; uintptr_t phys_addr; uintptr_t size; diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 0af6933a7100..11823004b87a 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -11,6 +11,8 @@ #include #ifdef CONFIG_MMU +#define __HAVE_ARCH_PUD_ALLOC_ONE +#define __HAVE_ARCH_PUD_FREE #include static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); } + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, + pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d_safe(p4d, + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +#define pud_alloc_one pud_alloc_one +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + if (pgtable_l4_enabled) + return __pud_alloc_one(mm, addr); + + return NULL; +} + +#define pud_free pud_free +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (pgtable_l4_enabled) + __pud_free(mm, pud); +} + +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) #endif /* __PAGETABLE_PMD_FOLDED */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 228261aa9628..bbbdd66e5e2f 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -8,16 +8,36 @@ #include -#define PGDIR_SHIFT 30 +extern bool pgtable_l4_enabled; + +#define PGDIR_SHIFT_L3 30 +#define PGDIR_SHIFT_L4 39 +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) + +#define PGDIR_SHIFT (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) /* Size of region mapped by a page global directory */ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +/* pud is folded into pgd in case of 3-level page table */ +#define PUD_SHIFT 30 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) + #define PMD_SHIFT 21 /* Size of region mapped by a page middle directory */ #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE - 1)) +/* Page Upper Directory entry */ +typedef struct { + unsigned long pud; +} pud_t; + +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) }) +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) + /* Page Middle Directory entry */ typedef struct { unsigned long pmd; @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) +{ + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); +} + +static inline unsigned long _pud_pfn(pud_t pud) +{ + return pud_val(pud) >> _PAGE_PFN_SHIFT; +} + static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); } +#define mm_pud_folded mm_pud_folded +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + if (pgtable_l4_enabled) + return false; + + return true; +} + +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) + static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) { return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + if (pgtable_l4_enabled) + *p4dp = p4d; + else + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); +} + +static inline int p4d_none(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) == 0); + + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) & _PAGE_PRESENT); + + return 1; +} + +static inline int p4d_bad(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return !p4d_present(p4d); + + return 0; +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if (pgtable_l4_enabled) + set_p4d(p4d, __p4d(0)); +} + +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); +} + +static inline struct page *p4d_page(p4d_t p4d) +{ + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); +} + +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +#define pud_offset pud_offset +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if (pgtable_l4_enabled) + return p4d_pgtable(*p4d) + pud_index(address); + + return (pud_t *)p4d; +} + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 23c7c66f2d79..2ab6650d3926 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -62,7 +62,7 @@ * position vmemmap directly below the VMALLOC region. */ #ifdef CONFIG_64BIT -#define VA_BITS 39 +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) #else #define VA_BITS 32 #endif @@ -102,8 +102,7 @@ #ifndef __ASSEMBLY__ -/* Page Upper Directory not used in RISC-V */ -#include +#include #include #include #include @@ -126,6 +125,17 @@ #define XIP_FIXUP(addr) (addr) #endif /* CONFIG_XIP_KERNEL */ +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); + pud_t *(*get_pud_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pud)(uintptr_t va); +#endif +}; + #ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) @@ -677,9 +687,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Note that PGDIR_SIZE must evenly divide TASK_SIZE. */ #ifdef CONFIG_64BIT -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) #else -#define TASK_SIZE FIXADDR_START +#define TASK_SIZE FIXADDR_START +#define TASK_SIZE_MIN TASK_SIZE #endif #else /* CONFIG_MMU */ @@ -705,6 +717,8 @@ extern uintptr_t _dtb_early_pa; #define dtb_early_va _dtb_early_va #define dtb_early_pa _dtb_early_pa #endif /* CONFIG_XIP_KERNEL */ +extern u64 satp_mode; +extern bool pgtable_l4_enabled; void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f52f01ecbeea..10e718559960 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -105,7 +105,8 @@ relocate: /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT - li a1, SATP_MODE + la a1, satp_mode + REG_L a1, 0(a1) or a2, a2, a1 /* diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ea54cc0c9106..7acbfbd14557 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) switch_mm_fast: csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | ((cntx & asid_mask) << SATP_ASID_SHIFT) | - SATP_MODE); + satp_mode); if (need_flush_tlb) local_flush_tlb_all(); @@ -201,7 +201,7 @@ switch_mm_fast: static void set_mm_noasid(struct mm_struct *mm) { /* Switch the page table and blindly nuke entire local TLB */ - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); local_flush_tlb_all(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 424d7777f4df..7ba915849817 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) #endif +#ifdef CONFIG_64BIT +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; +#else +u64 satp_mode = SATP_MODE_32; +#endif +EXPORT_SYMBOL(satp_mode); + +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? + true : false; +EXPORT_SYMBOL(pgtable_l4_enabled); + phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); @@ -53,15 +64,6 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -struct pt_alloc_ops { - pte_t *(*get_pte_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pte)(uintptr_t va); -#ifndef __PAGETABLE_PMD_FOLDED - pmd_t *(*get_pmd_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pmd)(uintptr_t va); -#endif -}; - static phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; +struct pt_alloc_ops _pt_ops __initdata; #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) #endif /* CONFIG_XIP_KERNEL */ +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); return (uintptr_t)early_pmd; } @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, create_pte_mapping(ptep, va, pa, sz, prot); } -#define pgd_next_t pmd_t -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) +static pud_t *__init get_pud_virt_early(phys_addr_t pa) +{ + return (pud_t *)((uintptr_t)pa); +} + +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PUD); + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); +} + +static pud_t *__init get_pud_virt_late(phys_addr_t pa) +{ + return (pud_t *)__va(pa); +} + +static phys_addr_t __init alloc_pud_early(uintptr_t va) +{ + /* Only one PUD is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_pud; +} + +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pud_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static void __init create_pud_mapping(pud_t *pudp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pmd_t *nextp; + phys_addr_t next_phys; + uintptr_t pud_index = pud_index(va); + + if (sz == PUD_SIZE) { + if (pud_val(pudp[pud_index]) == 0) + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); + return; + } + + if (pud_val(pudp[pud_index]) == 0) { + next_phys = pt_ops.alloc_pmd(va); + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pmd_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); + nextp = pt_ops.get_pmd_virt(next_phys); + } + + create_pmd_mapping(nextp, va, pa, sz, prot); +} + +#define pgd_next_t pud_t +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pmd + (pgtable_l4_enabled ? \ + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) +#define fixmap_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) +#define trampoline_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) +#define early_dtb_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pte +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, @@ -492,6 +581,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) } #endif /* CONFIG_STRICT_KERNEL_RWX */ +#ifdef CONFIG_64BIT +static void __init disable_pgtable_l4(void) +{ + pgtable_l4_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L3; + satp_mode = SATP_MODE_39; +} + +/* + * There is a simple way to determine if 4-level is supported by the + * underlying hardware: establish 1:1 mapping in 4-level page table mode + * then read SATP to see if the configuration was taken into account + * meaning sv48 is supported. + */ +static __init void set_satp_mode(void) +{ + u64 identity_satp, hw_satp; + uintptr_t set_satp_mode_pmd; + + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; + create_pgd_mapping(early_pg_dir, + set_satp_mode_pmd, (uintptr_t)early_pud, + PGDIR_SIZE, PAGE_TABLE); + create_pud_mapping(early_pud, + set_satp_mode_pmd, (uintptr_t)early_pmd, + PUD_SIZE, PAGE_TABLE); + /* Handle the case where set_satp_mode straddles 2 PMDs */ + create_pmd_mapping(early_pmd, + set_satp_mode_pmd, set_satp_mode_pmd, + PMD_SIZE, PAGE_KERNEL_EXEC); + create_pmd_mapping(early_pmd, + set_satp_mode_pmd + PMD_SIZE, + set_satp_mode_pmd + PMD_SIZE, + PMD_SIZE, PAGE_KERNEL_EXEC); + + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; + + local_flush_tlb_all(); + csr_write(CSR_SATP, identity_satp); + hw_satp = csr_swap(CSR_SATP, 0ULL); + local_flush_tlb_all(); + + if (hw_satp != identity_satp) + disable_pgtable_l4(); + + memset(early_pg_dir, 0, PAGE_SIZE); + memset(early_pud, 0, PAGE_SIZE); + memset(early_pmd, 0, PAGE_SIZE); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -556,10 +696,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, PGDIR_SIZE, IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); + if (pgtable_l4_enabled) { + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); + } + if (IS_ENABLED(CONFIG_64BIT)) { create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, pa, PMD_SIZE, PAGE_KERNEL); @@ -592,6 +737,8 @@ void pt_ops_set_early(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_early; pt_ops.get_pmd_virt = get_pmd_virt_early; + pt_ops.alloc_pud = alloc_pud_early; + pt_ops.get_pud_virt = get_pud_virt_early; #endif } @@ -610,6 +757,8 @@ void pt_ops_set_fixmap(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); #endif } @@ -624,6 +773,8 @@ void pt_ops_set_late(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_late; pt_ops.get_pmd_virt = get_pmd_virt_late; + pt_ops.alloc_pud = alloc_pud_late; + pt_ops.get_pud_virt = get_pud_virt_late; #endif } @@ -632,6 +783,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; kernel_map.virt_addr = KERNEL_LINK_ADDR; + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; @@ -646,6 +798,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; #endif + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) + set_satp_mode(); +#endif + kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; @@ -675,15 +832,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); #ifndef __PAGETABLE_PMD_FOLDED - /* Setup fixmap PMD */ + /* Setup fixmap PUD and PMD */ + if (pgtable_l4_enabled) + create_pud_mapping(fixmap_pud, FIXADDR_START, + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); + if (pgtable_l4_enabled) + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); #ifdef CONFIG_XIP_KERNEL create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); @@ -711,7 +874,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap * range can not span multiple pmds. */ - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); #ifndef __PAGETABLE_PMD_FOLDED @@ -782,9 +945,10 @@ static void __init setup_vm_final(void) /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); + clear_fixmap(FIX_PUD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); local_flush_tlb_all(); pt_ops_set_late(); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 1434a0225140..993f50571a3b 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,7 +11,29 @@ #include #include +/* + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 + * which is right before the kernel. + * + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate + * the page global directory with kasan_early_shadow_pmd. + * + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping + * must be divided as follows: + * - the first PGD entry, although incomplete, is populated with + * kasan_early_shadow_pud/p4d + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d + * - the last PGD entry is shared with the kernel mapping so populated at the + * lower levels pud/p4d + * + * In addition, when shallow populating a kasan region (for example vmalloc), + * this region may also not be aligned on PGDIR size, so we must go down to the + * pud level too. + */ + extern pgd_t early_pg_dir[PTRS_PER_PGD]; +extern struct pt_alloc_ops _pt_ops __initdata; +#define pt_ops _pt_ops static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; unsigned long next; - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + if (pud_none(*pud)) { base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } else { + base_pmd = (pmd_t *)pud_pgtable(*pud); + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } pmdp = base_pmd + pmd_index(vaddr); @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } +static void __init kasan_populate_pud(pgd_t *pgd, + unsigned long vaddr, unsigned long end, + bool early) +{ + phys_addr_t phys_addr; + pud_t *pudp, *base_pud; + unsigned long next; + + if (early) { + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. + */ + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); + } else { + base_pud = (pud_t *)pgd_page_vaddr(*pgd); + if (base_pud == lm_alias(kasan_early_shadow_pud)) + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + } + + pudp = base_pud + pud_index(vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + if (early) { + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } else { + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + } + + kasan_populate_pmd(pudp, vaddr, next); + } while (pudp++, vaddr = next, vaddr != end); + + /* + * Wait for the whole PGD to be populated before setting the PGD in + * the page table, otherwise, if we did set the PGD before populating + * it entirely, memblock could allocate a page at a physical address + * where KASAN is not populated yet and then we'd get a page fault. + */ + if (!early) + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); +} + +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)kasan_early_shadow_pud : \ + (uintptr_t)kasan_early_shadow_pmd) +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ + (pgtable_l4_enabled ? \ + kasan_populate_pud(pgdp, vaddr, next, early) : \ + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) + static void __init kasan_populate_pgd(pgd_t *pgdp, unsigned long vaddr, unsigned long end, bool early) @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, } } - kasan_populate_pmd(pgdp, vaddr, next); + kasan_populate_pgd_next(pgdp, vaddr, next, early); } while (pgdp++, vaddr = next, vaddr != end); } @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool kasan_populate) +{ + unsigned long next; + pud_t *pudp, *base_pud; + pmd_t *base_pmd; + bool is_kasan_pmd; + + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); + pudp = base_pud + pud_index(vaddr); + + if (kasan_populate) + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, + sizeof(pud_t) * PTRS_PER_PUD); + + do { + next = pud_addr_end(vaddr, end); + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); + + if (is_kasan_pmd) { + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + } + } while (pudp++, vaddr = next, vaddr != end); +} + static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) { unsigned long next; void *p; pgd_t *pgd_k = pgd_offset_k(vaddr); + bool is_kasan_pgd_next; do { next = pgd_addr_end(vaddr, end); - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); + + if (is_kasan_pgd_next) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } + + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) + continue; + + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); } while (pgd_k++, vaddr = next, vaddr != end); } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 26e69788f27a..b3db5d91ed38 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -40,6 +40,8 @@ #ifdef CONFIG_ARM64 # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 +#elif defined(CONFIG_RISCV) +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN #else # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE #endif From 73c7c8f68e7266bd558227bd9c598cb90b1673cc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:52 +0100 Subject: [PATCH 0252/1295] riscv: Use pgtable_l4_enabled to output mmu_type in cpuinfo Now that the mmu type is determined at runtime using SATP characteristic, use the global variable pgtable_l4_enabled to output mmu type of the processor through /proc/cpuinfo instead of relying on device tree infos. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index f13b2c9ea912..ad0a7e9f828b 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * Returns the hart ID of the given device tree node, or -ENODEV if the node @@ -71,18 +72,19 @@ static void print_isa(struct seq_file *f, const char *isa) seq_puts(f, "\n"); } -static void print_mmu(struct seq_file *f, const char *mmu_type) +static void print_mmu(struct seq_file *f) { -#if defined(CONFIG_32BIT) - if (strcmp(mmu_type, "riscv,sv32") != 0) - return; -#elif defined(CONFIG_64BIT) - if (strcmp(mmu_type, "riscv,sv39") != 0 && - strcmp(mmu_type, "riscv,sv48") != 0) - return; -#endif + char sv_type[16]; - seq_printf(f, "mmu\t\t: %s\n", mmu_type+6); +#if defined(CONFIG_32BIT) + strncpy(sv_type, "sv32", 5); +#elif defined(CONFIG_64BIT) + if (pgtable_l4_enabled) + strncpy(sv_type, "sv48", 5); + else + strncpy(sv_type, "sv39", 5); +#endif + seq_printf(f, "mmu\t\t: %s\n", sv_type); } static void *c_start(struct seq_file *m, loff_t *pos) @@ -107,14 +109,13 @@ static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; struct device_node *node = of_get_cpu_node(cpu_id, NULL); - const char *compat, *isa, *mmu; + const char *compat, *isa; seq_printf(m, "processor\t: %lu\n", cpu_id); seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id)); if (!of_property_read_string(node, "riscv,isa", &isa)) print_isa(m, isa); - if (!of_property_read_string(node, "mmu-type", &mmu)) - print_mmu(m, mmu); + print_mmu(m); if (!of_property_read_string(node, "compatible", &compat) && strcmp(compat, "riscv")) seq_printf(m, "uarch\t\t: %s\n", compat); From c774de22c430733487f70d755067d9ea55dbe6de Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:53 +0100 Subject: [PATCH 0253/1295] riscv: Explicit comment about user virtual address space size Define precisely the size of the user accessible virtual space size for sv32/39/48 mmu types and explain why the whole virtual address space is split into 2 equal chunks between kernel and user space. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2ab6650d3926..d0a96b507561 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -685,6 +685,15 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. + * Task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." */ #ifdef CONFIG_64BIT #define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) From 20aa49541a2ea2cb767ada04cbcaf12fe3ca1275 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Wed, 19 Jan 2022 11:38:36 +0800 Subject: [PATCH 0254/1295] riscv: fix boolconv.cocci warnings arch/riscv/mm/init.c:48:11-16: WARNING: conversion to bool not needed here Remove unneeded conversion to bool Semantic patch information: Relational and logical operators evaluate to bool, explicit conversion is overly verbose and unneeded. Generated by: scripts/coccinelle/misc/boolconv.cocci Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8def6f8db0b0..cf4d018b7d66 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -44,8 +44,7 @@ u64 satp_mode = SATP_MODE_32; #endif EXPORT_SYMBOL(satp_mode); -bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? - true : false; +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); EXPORT_SYMBOL(pgtable_l4_enabled); phys_addr_t phys_ram_base __ro_after_init; From 52d005337b2c94ab37273d9ad8382d4fb051defd Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jan 2022 22:00:29 -0600 Subject: [PATCH 0255/1295] smb3: send NTLMSSP version information For improved debugging it can be helpful to send version information as other clients do during NTLMSSP negotiation. See protocol document MS-NLMP section 2.2.1.1 Set the major and minor versions based on the kernel version, and the BuildNumber based on the internal cifs.ko module version number, and following the recommendation in the protocol documentation (MS-NLMP section 2.2.10) we set the NTLMRevisionCurrent field to 15. Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 1 + fs/cifs/ntlmssp.h | 30 +++++++++++++++++++- fs/cifs/sess.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 2 +- 4 files changed, 101 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9e5d9e192ef0..7c6f8180df69 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ +#define SMB3_PRODUCT_BUILD 34 #define CIFS_VERSION "2.34" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 6d242af536cb..298458404252 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -40,7 +40,7 @@ #define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 #define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 /* #define reserved4 0x1000000 */ -#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we only set for SMB2+ */ /* #define reserved3 0x4000000 */ /* #define reserved2 0x8000000 */ /* #define reserved1 0x10000000 */ @@ -87,6 +87,30 @@ typedef struct _NEGOTIATE_MESSAGE { /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; +#define NTLMSSP_REVISION_W2K3 0x0F + +/* See MS-NLMP section 2.2.2.10 */ +struct ntlmssp_version { + __u8 ProductMajorVersion; + __u8 ProductMinorVersion; + __le16 ProductBuild; /* we send the cifs.ko module version here */ + __u8 Reserved[3]; + __u8 NTLMRevisionCurrent; /* currently 0x0F */ +} __packed; + +/* see MS-NLMP section 2.2.1.1 */ +struct negotiate_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + struct ntlmssp_version Version; + /* SECURITY_BUFFER */ + char DomainString[0]; + /* followed by WorkstationString */ +} __packed; + typedef struct _CHALLENGE_MESSAGE { __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; __le32 MessageType; /* NtLmChallenge = 2 */ @@ -123,6 +147,10 @@ int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, struct TCP_Server_Info *server, const struct nls_table *nls_cp); +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp); int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, struct TCP_Server_Info *server, diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 15373a377a36..dc3b16d1be09 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -17,6 +17,8 @@ #include "nterr.h" #include #include +#include +#include "cifsfs.h" #include "cifs_spnego.h" #include "smb2proto.h" #include "fs_context.h" @@ -809,6 +811,74 @@ setup_ntlm_neg_ret: return rc; } +/* + * Build ntlmssp blob with additional fields, such as version, + * supported by modern servers. For safety limit to SMB3 or later + * See notes in MS-NLMP Section 2.2.2.1 e.g. + */ +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, + u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct negotiate_message *sec_blob; + __u32 flags; + unsigned char *tmp; + int len; + + len = size_of_ntlmssp_blob(ses, sizeof(struct negotiate_message)); + *pbuffer = kmalloc(len, GFP_KERNEL); + if (!*pbuffer) { + rc = -ENOMEM; + cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc); + *buflen = 0; + goto setup_ntlm_smb3_neg_ret; + } + sec_blob = (struct negotiate_message *)*pbuffer; + + memset(*pbuffer, 0, sizeof(struct negotiate_message)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | + NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL | + NTLMSSP_NEGOTIATE_SIGN | NTLMSSP_NEGOTIATE_VERSION; + if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + + sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR; + sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL; + sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD); + sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3; + + tmp = *pbuffer + sizeof(struct negotiate_message); + ses->ntlmssp->client_flags = flags; + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */ + cifs_security_buffer_from_str(&sec_blob->DomainName, + NULL, + CIFS_MAX_DOMAINNAME_LEN, + *pbuffer, &tmp, + nls_cp); + + cifs_security_buffer_from_str(&sec_blob->WorkstationName, + NULL, + CIFS_MAX_WORKSTATION_LEN, + *pbuffer, &tmp, + nls_cp); + + *buflen = tmp - *pbuffer; +setup_ntlm_smb3_neg_ret: + return rc; +} + + int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1e670e56b07a..7e7909b1ae11 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1506,7 +1506,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) if (rc) goto out_err; - rc = build_ntlmssp_negotiate_blob(&ntlmssp_blob, + rc = build_ntlmssp_smb3_negotiate_blob(&ntlmssp_blob, &blob_length, ses, server, sess_data->nls_cp); if (rc) From 51620150ca2df62f8ea472ab8962be590c957288 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jan 2022 22:11:33 -0600 Subject: [PATCH 0256/1295] cifs: update internal module number To 2.35 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7c6f8180df69..15a5c5db038b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,6 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define SMB3_PRODUCT_BUILD 34 -#define CIFS_VERSION "2.34" +#define SMB3_PRODUCT_BUILD 35 +#define CIFS_VERSION "2.35" #endif /* _CIFSFS_H */ From 440323b6cf5b9896013a78c4f578823e8243a7fd Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 14 Jan 2022 18:58:57 +0800 Subject: [PATCH 0257/1295] asm-generic: Add missing brackets for io_stop_wc macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After using io_stop_wc(), drivers reports following compile error when compiled on X86. drivers/net/ethernet/hisilicon/hns3/hns3_enet.c: In function ‘hns3_tx_push_bd’: drivers/net/ethernet/hisilicon/hns3/hns3_enet.c:2058:12: error: expected ‘;’ before ‘(’ token io_stop_wc(); ^ It is because I missed to add the brackets after io_stop_wc macro. So let's add the missing brackets. Fixes: d5624bb29f49 ("asm-generic: introduce io_stop_wc() and add implementation for ARM64") Reported-by: Guangbin Huang Signed-off-by: Xiongfeng Wang Link: https://lore.kernel.org/r/20220114105857.126300-1-wangxiongfeng2@huawei.com Signed-off-by: Catalin Marinas --- include/asm-generic/barrier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 4c2c1b830344..3385ca5ca5c0 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -259,7 +259,7 @@ do { \ * write-combining memory accesses before this macro with those after it. */ #ifndef io_stop_wc -#define io_stop_wc do { } while (0) +#define io_stop_wc() do { } while (0) #endif #endif /* !__ASSEMBLY__ */ From 3364c6ce23c6e347c63fc895e0d6d1e8a9407849 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jan 2022 12:22:59 -0800 Subject: [PATCH 0258/1295] arm64: atomics: lse: Dereference matching size When building with -Warray-bounds, the following warning is generated: In file included from ./arch/arm64/include/asm/lse.h:16, from ./arch/arm64/include/asm/cmpxchg.h:14, from ./arch/arm64/include/asm/atomic.h:16, from ./include/linux/atomic.h:7, from ./include/asm-generic/bitops/atomic.h:5, from ./arch/arm64/include/asm/bitops.h:25, from ./include/linux/bitops.h:33, from ./include/linux/kernel.h:22, from kernel/printk/printk.c:22: ./arch/arm64/include/asm/atomic_lse.h:247:9: warning: array subscript 'long unsigned int[0]' is partly outside array bounds of 'atomic_t[1]' [-Warray-bounds] 247 | asm volatile( \ | ^~~ ./arch/arm64/include/asm/atomic_lse.h:266:1: note: in expansion of macro '__CMPXCHG_CASE' 266 | __CMPXCHG_CASE(w, , acq_, 32, a, "memory") | ^~~~~~~~~~~~~~ kernel/printk/printk.c:3606:17: note: while referencing 'printk_cpulock_owner' 3606 | static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); | ^~~~~~~~~~~~~~~~~~~~ This is due to the compiler seeing an unsigned long * cast against something (atomic_t) that is int sized. Replace the cast with the matching size cast. This results in no change in binary output. Note that __ll_sc__cmpxchg_case_##name##sz already uses the same constraint: [v] "+Q" (*(u##sz *)ptr Which is why only the LSE form needs updating and not the LL/SC form, so this change is unlikely to be problematic. Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: linux-arm-kernel@lists.infradead.org Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220112202259.3950286-1-keescook@chromium.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/atomic_lse.h | 2 +- arch/arm64/include/asm/cmpxchg.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index d955ade5df7c..5d460f6b7675 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -249,7 +249,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ " mov %" #w "[tmp], %" #w "[old]\n" \ " cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n" \ " mov %" #w "[ret], %" #w "[tmp]" \ - : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr), \ + : [ret] "+r" (x0), [v] "+Q" (*(u##sz *)ptr), \ [tmp] "=&r" (tmp) \ : [old] "r" (x1), [new] "r" (x2) \ : cl); \ diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index f9bef42c1411..497acf134d99 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -243,7 +243,7 @@ static inline void __cmpwait_case_##sz(volatile void *ptr, \ " cbnz %" #w "[tmp], 1f\n" \ " wfe\n" \ "1:" \ - : [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr) \ + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \ : [val] "r" (val)); \ } From bb425a7598479fa0f171ec806033c440f218b0ce Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 15 Dec 2021 14:45:58 +0800 Subject: [PATCH 0259/1295] arm64: mm: apply __ro_after_init to memory_limit This variable is only set during initialization, so mark with __ro_after_init. Signed-off-by: Peng Fan Reviewed-by: David Hildenbrand Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20211215064559.2843555-1-peng.fan@oss.nxp.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index a8834434af99..db63cc885771 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,7 +172,7 @@ int pfn_is_map_memory(unsigned long pfn) } EXPORT_SYMBOL(pfn_is_map_memory); -static phys_addr_t memory_limit = PHYS_ADDR_MAX; +static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX; /* * Limit the memory size that was specified via FDT. From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Tue, 18 Jan 2022 14:20:13 -0500 Subject: [PATCH 0260/1295] net: fix information leakage in /proc/net/ptype In one net namespace, after creating a packet socket without binding it to a device, users in other net namespaces can observe the new `packet_type` added by this packet socket by reading `/proc/net/ptype` file. This is minor information leakage as packet socket is namespace aware. Add a net pointer in `packet_type` to keep the net namespace of of corresponding packet socket. In `ptype_seq_show`, this net pointer must be checked when it is not NULL. Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") Signed-off-by: Congyu Liu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/net-procfs.c | 3 ++- net/packet/af_packet.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3213c7227b59..e490b84732d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2548,6 +2548,7 @@ struct packet_type { struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); + struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..5b8016335aca 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -260,7 +260,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5bd409ab4cc2..85ea7ddb48db 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1774,6 +1774,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; list_add(&match->list, &fanout_list); @@ -3353,6 +3354,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; + po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; From 973bf8fdd12f0e70ea351c018e68edd377a836d1 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Tue, 18 Jan 2022 14:19:09 -0300 Subject: [PATCH 0261/1295] net: sched: Clarify error message when qdisc kind is unknown When adding a tc rule with a qdisc kind that is not supported or not compiled into the kernel, the kernel emits the following error: "Error: Specified qdisc not found.". Found via tdc testing when ETS qdisc was not compiled in and it was not obvious right away what the message meant without looking at the kernel code. Change the error message to be more explicit and say the qdisc kind is unknown. Signed-off-by: Victor Nogueira Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2cb496c84878..179825a3b2fd 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1204,7 +1204,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, err = -ENOENT; if (!ops) { - NL_SET_ERR_MSG(extack, "Specified qdisc not found"); + NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } From d15c7e875d44367005370e6a82e8f3a382a04f9b Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:52:43 -0600 Subject: [PATCH 0262/1295] net: phy: broadcom: hook up soft_reset for BCM54616S A problem was encountered with the Bel-Fuse 1GBT-SFP05 SFP module (which is a 1 Gbps copper module operating in SGMII mode with an internal BCM54616S PHY device) using the Xilinx AXI Ethernet MAC core, where the module would work properly on the initial insertion or boot of the device, but after the device was rebooted, the link would either only come up at 100 Mbps speeds or go up and down erratically. I found no meaningful changes in the PHY configuration registers between the working and non-working boots, but the status registers seemed to have a lot of error indications set on the SERDES side of the device on the non-working boot. I suspect the problem is that whatever happens on the SGMII link when the device is rebooted and the FPGA logic gets reloaded ends up putting the module's onboard PHY into a bad state. Since commit 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") the genphy_soft_reset call is not made automatically by the PHY core unless the callback is explicitly specified in the driver structure. For most of these Broadcom devices, there is probably a hardware reset that gets asserted to reset the PHY during boot, however for SFP modules (where the BCM54616S is commonly found) no such reset line exists, so if the board keeps the SFP cage powered up across a reboot, it will end up with no reset occurring during reboots. Hook up the genphy_soft_reset callback for BCM54616S to ensure that a PHY reset is performed before the device is initialized. This appears to fix the issue with erratic operation after a reboot with this SFP module. Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Signed-off-by: Robert Hancock Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index bb5104ae4610..3c683e0e40e9 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -854,6 +854,7 @@ static struct phy_driver broadcom_drivers[] = { .phy_id_mask = 0xfffffff0, .name = "Broadcom BCM54616S", /* PHY_GBIT_FEATURES */ + .soft_reset = genphy_soft_reset, .config_init = bcm54xx_config_init, .config_aneg = bcm54616s_config_aneg, .config_intr = bcm_phy_config_intr, From e2f08207c558bc0bc8abaa557cdb29bad776ac7b Mon Sep 17 00:00:00 2001 From: Moshe Tal Date: Thu, 20 Jan 2022 11:55:50 +0200 Subject: [PATCH 0263/1295] ethtool: Fix link extended state for big endian The link extended sub-states are assigned as enum that is an integer size but read from a union as u8, this is working for small values on little endian systems but for big endian this always give 0. Fix the variable in the union to match the enum size. Fixes: ecc31c60240b ("ethtool: Add link extended state") Signed-off-by: Moshe Tal Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Gal Pressman Reviewed-by: Amit Cohen Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a26f37a27167..11efc45de66a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -111,7 +111,7 @@ struct ethtool_link_ext_state_info { enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; enum ethtool_link_ext_substate_module module; - u8 __link_ext_substate; + u32 __link_ext_substate; }; }; From 6cee105e7f2ced596373951d9ea08dacc3883c68 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 20 Jan 2022 10:05:46 +0200 Subject: [PATCH 0264/1295] ipv6_tunnel: Rate limit warning messages The warning messages can be invoked from the data path for every packet transmitted through an ip6gre netdev, leading to high CPU utilization. Fix that by rate limiting the messages. Fixes: 09c6bbf090ec ("[IPV6]: Do mandatory IPv6 tunnel endpoint checks in realtime") Reported-by: Maksym Yaremchuk Tested-by: Maksym Yaremchuk Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index fe786df4f849..97ade833f58c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1036,14 +1036,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Local address not yet configured!\n", - p->name); + pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", + p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", - p->name); + pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", + p->name); else ret = 1; rcu_read_unlock(); From 1ba1a4a90fa416a6f389206416c5f488cf8b1543 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Wed, 19 Jan 2022 13:46:47 +0900 Subject: [PATCH 0265/1295] net: stmmac: dwmac-visconti: Fix bit definitions for ETHER_CLK_SEL just 0 should be used to represent cleared bits * ETHER_CLK_SEL_DIV_SEL_20 * ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN * ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN * ETHER_CLK_SEL_TX_CLK_O_TX_I * ETHER_CLK_SEL_RMII_CLK_SEL_IN Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Signed-off-by: Yuji Ishikawa Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index e2e0f977875d..43a446ceadf7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -22,21 +22,21 @@ #define ETHER_CLK_SEL_RMII_CLK_EN BIT(2) #define ETHER_CLK_SEL_RMII_CLK_RST BIT(3) #define ETHER_CLK_SEL_DIV_SEL_2 BIT(4) -#define ETHER_CLK_SEL_DIV_SEL_20 BIT(0) +#define ETHER_CLK_SEL_DIV_SEL_20 0 #define ETHER_CLK_SEL_FREQ_SEL_125M (BIT(9) | BIT(8)) #define ETHER_CLK_SEL_FREQ_SEL_50M BIT(9) #define ETHER_CLK_SEL_FREQ_SEL_25M BIT(8) #define ETHER_CLK_SEL_FREQ_SEL_2P5M 0 -#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN BIT(0) +#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN 0 #define ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC BIT(10) #define ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV BIT(11) -#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN BIT(0) +#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN 0 #define ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC BIT(12) #define ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV BIT(13) -#define ETHER_CLK_SEL_TX_CLK_O_TX_I BIT(0) +#define ETHER_CLK_SEL_TX_CLK_O_TX_I 0 #define ETHER_CLK_SEL_TX_CLK_O_RMII_I BIT(14) #define ETHER_CLK_SEL_TX_O_E_N_IN BIT(15) -#define ETHER_CLK_SEL_RMII_CLK_SEL_IN BIT(0) +#define ETHER_CLK_SEL_RMII_CLK_SEL_IN 0 #define ETHER_CLK_SEL_RMII_CLK_SEL_RX_C BIT(16) #define ETHER_CLK_SEL_RX_TX_CLK_EN (ETHER_CLK_SEL_RX_CLK_EN | ETHER_CLK_SEL_TX_CLK_EN) From 0959bc4bd4206433ed101a1332a23e93ad16ec77 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Wed, 19 Jan 2022 13:46:48 +0900 Subject: [PATCH 0266/1295] net: stmmac: dwmac-visconti: Fix clock configuration for RMII mode Bit pattern of the ETHER_CLOCK_SEL register for RMII/MII mode should be fixed. Also, some control bits should be modified with a specific sequence. Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Signed-off-by: Yuji Ishikawa Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: David S. Miller --- .../ethernet/stmicro/stmmac/dwmac-visconti.c | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index 43a446ceadf7..dde5b772a5af 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -96,31 +96,41 @@ static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) val |= ETHER_CLK_SEL_TX_O_E_N_IN; writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + /* Set Clock-Mux, Start clock, Set TX_O direction */ switch (dwmac->phy_intf_sel) { case ETHER_CONFIG_INTF_RGMII: val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val |= ETHER_CLK_SEL_RX_TX_CLK_EN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val &= ~ETHER_CLK_SEL_TX_O_E_N_IN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); break; case ETHER_CONFIG_INTF_RMII: val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV | - ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC | ETHER_CLK_SEL_TX_O_E_N_IN | + ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV | ETHER_CLK_SEL_TX_O_E_N_IN | ETHER_CLK_SEL_RMII_CLK_SEL_RX_C; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val |= ETHER_CLK_SEL_RMII_CLK_RST; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val |= ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); break; case ETHER_CONFIG_INTF_MII: default: val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC | - ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV | ETHER_CLK_SEL_TX_O_E_N_IN | - ETHER_CLK_SEL_RMII_CLK_EN; + ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC | ETHER_CLK_SEL_TX_O_E_N_IN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val |= ETHER_CLK_SEL_RX_TX_CLK_EN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); break; } - /* Start clock */ - writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); - val |= ETHER_CLK_SEL_RX_TX_CLK_EN; - writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); - - val &= ~ETHER_CLK_SEL_TX_O_E_N_IN; - writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); - spin_unlock_irqrestore(&dwmac->lock, flags); } From 217663f101a56ef77f82273818253fff082bf503 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 20 Jan 2022 13:57:22 +0100 Subject: [PATCH 0267/1295] fanotify: remove variable set but not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that uses the pointer info has been removed in 7326e382c21e ("fanotify: report old and/or new parent+name in FAN_RENAME event"). and fanotify_event_info() doesn't change 'event', so the declaration and assignment of info can be removed. Eliminate the following clang warning: fs/notify/fanotify/fanotify_user.c:161:24: warning: variable ‘info’ set but not used Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 73a3e939c921..2641b0802479 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -151,7 +151,6 @@ static size_t fanotify_event_len(unsigned int info_mode, struct fanotify_event *event) { size_t event_len = FAN_EVENT_METADATA_LEN; - struct fanotify_info *info; int fh_len; int dot_len = 0; @@ -161,8 +160,6 @@ static size_t fanotify_event_len(unsigned int info_mode, if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; - info = fanotify_event_info(event); - if (fanotify_event_has_any_dir_fh(event)) { event_len += fanotify_dir_name_info_len(event); } else if ((info_mode & FAN_REPORT_NAME) && From 6e10e21915c1ab6eaa145f7b5ebaf4500af1b011 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 8 Sep 2021 16:09:08 -0300 Subject: [PATCH 0268/1295] tools headers UAPI: Sync files changed by new set_mempolicy_home_node syscall To pick the changes in these csets: 21b084fdf2a49ca1 ("mm/mempolicy: wire up syscall set_mempolicy_home_node") That add support for this new syscall in tools such as 'perf trace'. For instance, this is now possible: [root@five ~]# perf trace -e set_mempolicy_home_node ^C[root@five ~]# [root@five ~]# perf trace -v -e set_mempolicy_home_node Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253729 && common_pid != 3585) && (id == 450) mmap size 528384B ^C[root@five ~] [root@five ~]# perf trace -v -e set* --max-events 5 Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253734 && common_pid != 3585) && (id == 38 || id == 54 || id == 105 || id == 106 || id == 109 || id == 112 || id == 113 || id == 114 || id == 116 || id == 117 || id == 119 || id == 122 || id == 123 || id == 141 || id == 160 || id == 164 || id == 170 || id == 171 || id == 188 || id == 205 || id == 218 || id == 238 || id == 273 || id == 308 || id == 450) mmap size 528384B 0.000 ( 0.008 ms): bash/253735 setpgid(pid: 253735 (bash), pgid: 253735 (bash)) = 0 6849.011 ( 0.008 ms): bash/16046 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 6849.080 ( 0.005 ms): bash/253736 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 7437.718 ( 0.009 ms): gnome-shell/253737 set_robust_list(head: 0x7f34b527e920, len: 24) = 0 13445.986 ( 0.010 ms): bash/16046 setpgid(pid: 253738 (bash), pgid: 253738 (bash)) = 0 [root@five ~]# That is the filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ find tools/perf/arch/ -name "syscall*tbl" | xargs grep -w set_mempolicy_home_node tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/s390/entry/syscalls/syscall.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node $ $ grep -w set_mempolicy_home_node /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c [450] = "set_mempolicy_home_node", $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Aneesh Kumar K.V Cc: Linus Torvalds Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 5 ++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 1 + 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently From 3938d5a2f9369d1ebd56320629fed395ce327e9c Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Thu, 16 Dec 2021 13:35:38 +0100 Subject: [PATCH 0269/1295] riscv: default to CONFIG_RISCV_SBI_V01=n The SBI 0.1 specification is obsolete. The current version is 0.3. Hence we should not rely by default on SBI 0.1 being implemented. Signed-off-by: Heinrich Schuchardt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 38205f25c0d7..86553d8c6b07 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -369,7 +369,6 @@ source "kernel/Kconfig.hz" config RISCV_SBI_V01 bool "SBI v0.1 support" - default y depends on RISCV_SBI help This config allows kernel to use SBI v0.1 APIs. This will be From a0f4ba7f51ea736a6b4ccf58563507d7af9128fb Mon Sep 17 00:00:00 2001 From: Jinrong Liang Date: Wed, 19 Jan 2022 21:39:10 +0800 Subject: [PATCH 0270/1295] selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c The following warning appears when executing make -C tools/testing/selftests/kvm x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_intel_br_retired': x86_64/pmu_event_filter_test.c:241:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable] 241 | struct kvm_cpuid2 *cpuid; | ^~~~~ x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_amd_zen_br_retired': x86_64/pmu_event_filter_test.c:258:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable] 258 | struct kvm_cpuid2 *cpuid; | ^~~~~ Just delete the unused variables to stay away from warnings. Fixes: dc7e75b3b3ee ("selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER") Signed-off-by: Jinrong Liang Message-Id: <20220119133910.56285-1-cloudliang@tencent.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index aa104946e6e0..c715adcbd487 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -347,9 +347,7 @@ static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) static bool use_intel_pmu(void) { struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid2 *cpuid; - cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(0xa, 0); return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); } @@ -381,9 +379,7 @@ static bool is_zen3(uint32_t eax) static bool use_amd_pmu(void) { struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid2 *cpuid; - cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(1, 0); return is_amd_cpu() && entry && (is_zen1(entry->eax) || From 83a34ad848937462aa64fa3d48f8c0b4034f2503 Mon Sep 17 00:00:00 2001 From: Jinrong Liang Date: Wed, 19 Jan 2022 22:03:25 +0800 Subject: [PATCH 0271/1295] selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c The following warning appears when executing make -C tools/testing/selftests/kvm include/x86_64/processor.h:290:2: warning: 'ecx' may be used uninitialized in this function [-Wmaybe-uninitialized] asm volatile("cpuid" ^~~ lib/x86_64/processor.c:1523:21: note: 'ecx' was declared here uint32_t eax, ebx, ecx, edx, max_ext_leaf; Just initialize ecx to remove this warning. Fixes: c8cc43c1eae2 ("selftests: KVM: avoid failures due to reserved HyperTransport region") Signed-off-by: Jinrong Liang Message-Id: <20220119140325.59369-1-cloudliang@tencent.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5c8c270a9158..5f9d7e91dc69 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1535,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) /* Before family 17h, the HyperTransport area is just below 1T. */ ht_gfn = (1 << 28) - num_ht_pages; eax = 1; + ecx = 0; cpuid(&eax, &ebx, &ecx, &edx); if (x86_family(eax) < 0x17) goto done; From e2e83a73d7ce66f62c7830a85619542ef59c90e4 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 19 Jan 2022 23:50:03 -0500 Subject: [PATCH 0272/1295] docs: kvm: fix WARNINGs from api.rst Use the api number 134 for KVM_GET_XSAVE2, instead of 42, which has been used by KVM_GET_XSAVE. Also, fix the WARNINGs of the underlines being too short. Reported-by: Stephen Rothwell Signed-off-by: Wei Wang Tested-by: Stephen Rothwell Message-Id: <20220120045003.315177-1-wei.w.wang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d3791a14eb9a..bb8cfddbb22d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header. The Stats Data block contains an array of 64-bit values in the same order as the descriptors in Descriptors block. -4.42 KVM_GET_XSAVE2 ------------------- +4.134 KVM_GET_XSAVE2 +-------------------- :Capability: KVM_CAP_XSAVE2 :Architectures: x86 @@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID ------------------------------ +------------------------------------- Architectures: x86 From 9a2451f1866344d38b4a1dc20396e3a03954fcd7 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:13 -0800 Subject: [PATCH 0273/1295] RISC-V: Avoid using per cpu array for ordered booting Currently both order booting and spinwait approach uses a per cpu array to update stack & task pointer. This approach will not work for the following cases. 1. If NR_CPUs are configured to be less than highest hart id. 2. A platform has sparse hartid. This issue can be fixed for ordered booting as the booting cpu brings up one cpu at a time using SBI HSM extension which has opaque parameter that is unused until now. Introduce a common secondary boot data structure that can store the stack and task pointer. Secondary harts will use this data while booting up to setup the sp & tp. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpu_ops_sbi.h | 25 +++++++++++++++++++++++++ arch/riscv/kernel/asm-offsets.c | 3 +++ arch/riscv/kernel/cpu_ops_sbi.c | 24 +++++++++++++++++++----- arch/riscv/kernel/head.S | 19 ++++++++++--------- 4 files changed, 57 insertions(+), 14 deletions(-) create mode 100644 arch/riscv/include/asm/cpu_ops_sbi.h diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h new file mode 100644 index 000000000000..56e4b76d09ff --- /dev/null +++ b/arch/riscv/include/asm/cpu_ops_sbi.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021 by Rivos Inc. + */ +#ifndef __ASM_CPU_OPS_SBI_H +#define __ASM_CPU_OPS_SBI_H + +#ifndef __ASSEMBLY__ +#include +#include +#include + +/** + * struct sbi_hart_boot_data - Hart specific boot used during booting and + * cpu hotplug. + * @task_ptr: A pointer to the hart specific tp + * @stack_ptr: A pointer to the hart specific sp + */ +struct sbi_hart_boot_data { + void *task_ptr; + void *stack_ptr; +}; +#endif + +#endif /* ifndef __ASM_CPU_OPS_SBI_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 253126e4beef..df0519a64eaf 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include void asm_offsets(void); @@ -468,4 +469,6 @@ void asm_offsets(void) DEFINE(PT_SIZE_ON_STACK, ALIGN(sizeof(struct pt_regs), STACK_ALIGN)); OFFSET(KERNEL_MAP_VIRT_ADDR, kernel_mapping, virt_addr); + OFFSET(SBI_HART_BOOT_TASK_PTR_OFFSET, sbi_hart_boot_data, task_ptr); + OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr); } diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 685fae72b7f5..dae29cbfe550 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -7,13 +7,22 @@ #include #include +#include #include +#include #include #include extern char secondary_start_sbi[]; const struct cpu_operations cpu_ops_sbi; +/* + * Ordered booting via HSM brings one cpu at a time. However, cpu hotplug can + * be invoked from multiple threads in parallel. Define a per cpu data + * to handle that. + */ +DEFINE_PER_CPU(struct sbi_hart_boot_data, boot_data); + static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr, unsigned long priv) { @@ -55,14 +64,19 @@ static int sbi_hsm_hart_get_status(unsigned long hartid) static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) { - int rc; unsigned long boot_addr = __pa_symbol(secondary_start_sbi); int hartid = cpuid_to_hartid_map(cpuid); + unsigned long hsm_data; + struct sbi_hart_boot_data *bdata = &per_cpu(boot_data, cpuid); - cpu_update_secondary_bootdata(cpuid, tidle); - rc = sbi_hsm_hart_start(hartid, boot_addr, 0); - - return rc; + /* Make sure tidle is updated */ + smp_mb(); + bdata->task_ptr = tidle; + bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + /* Make sure boot data is updated */ + smp_mb(); + hsm_data = __pa(bdata); + return sbi_hsm_hart_start(hartid, boot_addr, hsm_data); } static int sbi_cpu_prepare(unsigned int cpuid) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index db4be6a7e392..05ea372e1909 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include "efi-header.S" @@ -168,15 +169,15 @@ secondary_start_sbi: la a3, .Lsecondary_park csrw CSR_TVEC, a3 - slli a3, a0, LGREG - la a4, __cpu_up_stack_pointer - XIP_FIXUP_OFFSET a4 - la a5, __cpu_up_task_pointer - XIP_FIXUP_OFFSET a5 - add a4, a3, a4 - add a5, a3, a5 - REG_L sp, (a4) - REG_L tp, (a5) + /* a0 contains the hartid & a1 contains boot data */ + li a2, SBI_HART_BOOT_TASK_PTR_OFFSET + XIP_FIXUP_OFFSET a2 + add a2, a2, a1 + REG_L tp, (a2) + li a3, SBI_HART_BOOT_STACK_PTR_OFFSET + XIP_FIXUP_OFFSET a3 + add a3, a3, a1 + REG_L sp, (a3) .Lsecondary_start_common: From 410bb20a698d4c95c63e7f2b6f6f7d8da43795f5 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:14 -0800 Subject: [PATCH 0274/1295] RISC-V: Do not print the SBI version during HSM extension boot print The HSM extension information log also prints the SBI version v0.2. This is misleading as the underlying firmware SBI version may be different from v0.2. Remove the unncessary printing of SBI version. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 1985884fe829..3f5a38b03044 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -38,7 +38,7 @@ void __init cpu_set_ops(int cpuid) #if IS_ENABLED(CONFIG_RISCV_SBI) if (sbi_probe_extension(SBI_EXT_HSM) > 0) { if (!cpuid) - pr_info("SBI v0.2 HSM extension detected\n"); + pr_info("SBI HSM extension detected\n"); cpu_ops[cpuid] = &cpu_ops_sbi; } else #endif From c78f94f35cf6486c4057317e8de3ddc4c62e12c7 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:15 -0800 Subject: [PATCH 0275/1295] RISC-V: Use __cpu_up_stack/task_pointer only for spinwait method The __cpu_up_stack/task_pointer array is only used for spinwait method now. The per cpu array based lookup is also fragile for platforms with discontiguous/sparse hartids. The spinwait method is only used for M-mode Linux or older firmwares without SBI HSM extension. For general Linux systems, ordered booting method is preferred anyways to support cpu hotplug and kexec. Make sure that __cpu_up_stack/task_pointer is only used for spinwait method. Take this opportunity to rename it to __cpu_spinwait_stack/task_pointer to emphasize the purpose as well. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpu_ops.h | 2 -- arch/riscv/kernel/cpu_ops.c | 16 ---------------- arch/riscv/kernel/cpu_ops_spinwait.c | 27 ++++++++++++++++++++++++++- arch/riscv/kernel/head.S | 4 ++-- arch/riscv/kernel/head.h | 4 ++-- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h index a8ec3c5c1bd2..134590f1b843 100644 --- a/arch/riscv/include/asm/cpu_ops.h +++ b/arch/riscv/include/asm/cpu_ops.h @@ -40,7 +40,5 @@ struct cpu_operations { extern const struct cpu_operations *cpu_ops[NR_CPUS]; void __init cpu_set_ops(int cpu); -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle); #endif /* ifndef __ASM_CPU_OPS_H */ diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 3f5a38b03044..c1e30f403c3b 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -8,31 +8,15 @@ #include #include #include -#include #include #include #include const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -void *__cpu_up_stack_pointer[NR_CPUS] __section(".data"); -void *__cpu_up_task_pointer[NR_CPUS] __section(".data"); - extern const struct cpu_operations cpu_ops_sbi; extern const struct cpu_operations cpu_ops_spinwait; -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle) -{ - int hartid = cpuid_to_hartid_map(cpuid); - - /* Make sure tidle is updated */ - smp_mb(); - WRITE_ONCE(__cpu_up_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); - WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); -} - void __init cpu_set_ops(int cpuid) { #if IS_ENABLED(CONFIG_RISCV_SBI) diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index b2c957bb68c1..346847f6c41c 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -6,11 +6,36 @@ #include #include #include +#include #include #include #include const struct cpu_operations cpu_ops_spinwait; +void *__cpu_spinwait_stack_pointer[NR_CPUS] __section(".data"); +void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data"); + +static void cpu_update_secondary_bootdata(unsigned int cpuid, + struct task_struct *tidle) +{ + int hartid = cpuid_to_hartid_map(cpuid); + + /* + * The hartid must be less than NR_CPUS to avoid out-of-bound access + * errors for __cpu_spinwait_stack/task_pointer. That is not always possible + * for platforms with discontiguous hartid numbering scheme. That's why + * spinwait booting is not the recommended approach for any platforms + * booting Linux in S-mode and can be disabled in the future. + */ + if (hartid == INVALID_HARTID || hartid >= NR_CPUS) + return; + + /* Make sure tidle is updated */ + smp_mb(); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], + task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); +} static int spinwait_cpu_prepare(unsigned int cpuid) { @@ -28,7 +53,7 @@ static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) * selects the first cpu to boot the kernel and causes the remainder * of the cpus to spin in a loop waiting for their stack pointer to be * setup by that main cpu. Writing to bootdata - * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they + * (i.e __cpu_spinwait_stack_pointer) signals to the spinning cpus that they * can continue the boot process. */ cpu_update_secondary_bootdata(cpuid, tidle); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 05ea372e1909..efa9cd499ac6 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -347,9 +347,9 @@ clear_bss_done: csrw CSR_TVEC, a3 slli a3, a0, LGREG - la a1, __cpu_up_stack_pointer + la a1, __cpu_spinwait_stack_pointer XIP_FIXUP_OFFSET a1 - la a2, __cpu_up_task_pointer + la a2, __cpu_spinwait_task_pointer XIP_FIXUP_OFFSET a2 add a1, a3, a1 add a2, a3, a2 diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index aabbc3ac3e48..5393cca77790 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,7 +16,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); asmlinkage void __init __copy_data(void); #endif -extern void *__cpu_up_stack_pointer[]; -extern void *__cpu_up_task_pointer[]; +extern void *__cpu_spinwait_stack_pointer[]; +extern void *__cpu_spinwait_task_pointer[]; #endif /* __ASM_HEAD_H */ From 0b39eb38f85908e039ce8c9f09868438e029757b Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:16 -0800 Subject: [PATCH 0276/1295] RISC-V: Move the entire hart selection via lottery to SMP The booting hart selection via lottery is only useful for SMP systems. Moreover, the lottery selection is only necessary for systems using spinwait booting method. It is better to keep the entire lottery selection together so that it can be disabled in future. Move the lottery selection code to under CONFIG_SMP. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index efa9cd499ac6..b0766f62bd73 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -264,8 +264,8 @@ pmp_done: blt a0, t0, .Lgood_cores tail .Lsecondary_park .Lgood_cores: -#endif + /* The lottery system is only required for spinwait booting method */ #ifndef CONFIG_XIP_KERNEL /* Pick one hart to run the main boot sequence */ la a3, hart_lottery @@ -284,6 +284,10 @@ pmp_done: /* first time here if hart_lottery in RAM is not set */ beq t0, t1, .Lsecondary_start +#endif /* CONFIG_XIP */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XIP_KERNEL la sp, _end + THREAD_SIZE XIP_FIXUP_OFFSET sp mv s0, a0 @@ -340,8 +344,8 @@ clear_bss_done: call soc_early_init tail start_kernel -.Lsecondary_start: #ifdef CONFIG_SMP +.Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park csrw CSR_TVEC, a3 From 2ffc48fc7071da4b2d881b0f21d37ed05feb697b Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:17 -0800 Subject: [PATCH 0277/1295] RISC-V: Move spinwait booting method to its own config The spinwait booting method should only be used for platforms with older firmware without SBI HSM extension or M-mode firmware because spinwait method can't support cpu hotplug, kexec or sparse hartid. It is better to move the entire spinwait implementation to its own config which can be disabled if required. It is enabled by default to maintain backward compatibility and M-mode Linux. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 14 ++++++++++++++ arch/riscv/kernel/Makefile | 3 ++- arch/riscv/kernel/cpu_ops.c | 8 ++++++++ arch/riscv/kernel/head.S | 8 ++++---- arch/riscv/kernel/head.h | 2 ++ 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 86553d8c6b07..fd60ab94260b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -374,6 +374,20 @@ config RISCV_SBI_V01 This config allows kernel to use SBI v0.1 APIs. This will be deprecated in future once legacy M-mode software are no longer in use. +config RISCV_BOOT_SPINWAIT + bool "Spinwait booting method" + depends on SMP + default y + help + This enables support for booting Linux via spinwait method. In the + spinwait method, all cores randomly jump to Linux. One of the cores + gets chosen via lottery and all other keep spinning on a percpu + variable. This method cannot support CPU hotplug and sparse hartid + scheme. It should be only enabled for M-mode Linux or platforms relying + on older firmware without SBI HSM extension. All other platforms should + rely on ordered booting via SBI HSM extension which gets chosen + dynamically at runtime if the firmware supports it. + config KEXEC bool "Kexec system call" select KEXEC_CORE diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3397ddac1a30..612556faa527 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -43,7 +43,8 @@ obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o -obj-$(CONFIG_SMP) += cpu_ops_spinwait.o + +obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index c1e30f403c3b..170d07e57721 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -15,7 +15,15 @@ const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; extern const struct cpu_operations cpu_ops_sbi; +#ifdef CONFIG_RISCV_BOOT_SPINWAIT extern const struct cpu_operations cpu_ops_spinwait; +#else +const struct cpu_operations cpu_ops_spinwait = { + .name = "", + .cpu_prepare = NULL, + .cpu_start = NULL, +}; +#endif void __init cpu_set_ops(int cpuid) { diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index b0766f62bd73..2363b43312fc 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -259,7 +259,7 @@ pmp_done: li t0, SR_FS csrc CSR_STATUS, t0 -#ifdef CONFIG_SMP +#ifdef CONFIG_RISCV_BOOT_SPINWAIT li t0, CONFIG_NR_CPUS blt a0, t0, .Lgood_cores tail .Lsecondary_park @@ -285,7 +285,7 @@ pmp_done: beq t0, t1, .Lsecondary_start #endif /* CONFIG_XIP */ -#endif /* CONFIG_SMP */ +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ #ifdef CONFIG_XIP_KERNEL la sp, _end + THREAD_SIZE @@ -344,7 +344,7 @@ clear_bss_done: call soc_early_init tail start_kernel -#ifdef CONFIG_SMP +#if CONFIG_RISCV_BOOT_SPINWAIT .Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park @@ -371,7 +371,7 @@ clear_bss_done: fence tail .Lsecondary_start_common -#endif +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ END(_start_kernel) diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 5393cca77790..726731ada534 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,7 +16,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); asmlinkage void __init __copy_data(void); #endif +#ifdef CONFIG_RISCV_BOOT_SPINWAIT extern void *__cpu_spinwait_stack_pointer[]; extern void *__cpu_spinwait_task_pointer[]; +#endif #endif /* __ASM_HEAD_H */ From 26fb751ca37846c912daa347be298bfd945cc560 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:18 -0800 Subject: [PATCH 0278/1295] RISC-V: Do not use cpumask data structure for hartid bitmap Currently, SBI APIs accept a hartmask that is generated from struct cpumask. Cpumask data structure can hold upto NR_CPUs value. Thus, it is not the correct data structure for hartids as it can be higher than NR_CPUs for platforms with sparse or discontguous hartids. Remove all association between hartid mask and struct cpumask. Reviewed-by: Anup Patel (For Linux RISC-V changes) Acked-by: Anup Patel (For KVM RISC-V changes) Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 19 +-- arch/riscv/include/asm/smp.h | 2 - arch/riscv/kernel/sbi.c | 189 +++++++++++++++++------------- arch/riscv/kernel/setup.c | 10 -- arch/riscv/kernel/smpboot.c | 2 +- arch/riscv/kvm/mmu.c | 4 +- arch/riscv/kvm/vcpu_sbi_replace.c | 11 +- arch/riscv/kvm/vcpu_sbi_v01.c | 11 +- arch/riscv/kvm/vmid.c | 4 +- arch/riscv/mm/cacheflush.c | 5 +- arch/riscv/mm/tlbflush.c | 9 +- 11 files changed, 130 insertions(+), 136 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 26ba6f2d7a40..d1c37479d828 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -8,6 +8,7 @@ #define _ASM_RISCV_SBI_H #include +#include #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { @@ -128,27 +129,27 @@ long sbi_get_mimpid(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); -int sbi_send_ipi(const unsigned long *hart_mask); -int sbi_remote_fence_i(const unsigned long *hart_mask); -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_send_ipi(const struct cpumask *cpu_mask); +int sbi_remote_fence_i(const struct cpumask *cpu_mask); +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid); -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); @@ -183,7 +184,7 @@ static inline unsigned long sbi_mk_version(unsigned long major, int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ -static inline int sbi_remote_fence_i(const unsigned long *hart_mask) { return -1; } +static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; } static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index 6ad749f42807..23170c933d73 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -92,8 +92,6 @@ static inline void riscv_clear_ipi(void) #endif /* CONFIG_SMP */ -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); - #if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP) bool cpu_has_hotplug(unsigned int cpu); #else diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 9a84f0cb5175..f72527fcb347 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -16,8 +16,8 @@ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; -static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init; -static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, +static int (*__sbi_send_ipi)(const struct cpumask *cpu_mask) __ro_after_init; +static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; @@ -67,6 +67,30 @@ int sbi_err_map_linux_errno(int err) EXPORT_SYMBOL(sbi_err_map_linux_errno); #ifdef CONFIG_RISCV_SBI_V01 +static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) +{ + unsigned long cpuid, hartid; + unsigned long hmask = 0; + + /* + * There is no maximum hartid concept in RISC-V and NR_CPUS must not be + * associated with hartid. As SBI v0.1 is only kept for backward compatibility + * and will be removed in the future, there is no point in supporting hartid + * greater than BITS_PER_LONG (32 for RV32 and 64 for RV64). Ideally, SBI v0.2 + * should be used for platforms with hartid greater than BITS_PER_LONG. + */ + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hartid >= BITS_PER_LONG) { + pr_warn("Unable to send any request to hartid > BITS_PER_LONG for SBI v0.1\n"); + break; + } + hmask |= 1 << hartid; + } + + return hmask; +} + /** * sbi_console_putchar() - Writes given character to the console device. * @ch: The data to be written to the console. @@ -132,33 +156,44 @@ static void __sbi_set_timer_v01(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { - sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); + + sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)(&hart_mask), 0, 0, 0, 0, 0); return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { int result = 0; + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); /* v0.2 function IDs are equivalent to v0.1 extension IDs */ switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, - (unsigned long)hart_mask, 0, 0, 0, 0, 0); + (unsigned long)&hart_mask, 0, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, arg4, 0, 0); break; default: @@ -180,7 +215,7 @@ static void __sbi_set_timer_v01(uint64_t stime_value) sbi_major_version(), sbi_minor_version()); } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { pr_warn("IPI extension is not available in SBI v%lu.%lu\n", sbi_major_version(), sbi_minor_version()); @@ -188,7 +223,7 @@ static int __sbi_send_ipi_v01(const unsigned long *hart_mask) return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { @@ -212,37 +247,33 @@ static void __sbi_set_timer_v02(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v02(const unsigned long *hart_mask) +static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) { - unsigned long hartid, hmask_val, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; struct sbiret ret = {0}; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { + if (hmask) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; } @@ -252,11 +283,11 @@ static int __sbi_send_ipi_v02(const unsigned long *hart_mask) ecall_failed: result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); return result; } -static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, +static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask, unsigned long hbase, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) @@ -267,31 +298,31 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0); + ret = sbi_ecall(ext, fid, hmask, hbase, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; default: @@ -303,43 +334,39 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, if (ret.error) { result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); } return result; } -static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { - unsigned long hmask_val, hartid, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + if (hmask) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; @@ -361,44 +388,44 @@ void sbi_set_timer(uint64_t stime_value) /** * sbi_send_ipi() - Send an IPI to any hart. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_send_ipi(const unsigned long *hart_mask) +int sbi_send_ipi(const struct cpumask *cpu_mask) { - return __sbi_send_ipi(hart_mask); + return __sbi_send_ipi(cpu_mask); } EXPORT_SYMBOL(sbi_send_ipi); /** * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_fence_i(const unsigned long *hart_mask) +int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, - hart_mask, 0, 0, 0, 0); + cpu_mask, 0, 0, 0, 0); } EXPORT_SYMBOL(sbi_remote_fence_i); /** * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote * harts for the specified virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma); @@ -406,38 +433,38 @@ EXPORT_SYMBOL(sbi_remote_sfence_vma); * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given * remote harts for a virtual address range belonging to a specific ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * @asid: The value of address space identifier (ASID). * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); /** * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote * harts for the specified guest physical address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * * Return: None */ -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); @@ -445,38 +472,38 @@ EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given * remote harts for a guest physical address range belonging to a specific VMID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * @vmid: The value of guest ID (VMID). * * Return: 0 if success, Error otherwise. */ -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, - hart_mask, start, size, vmid, 0); + cpu_mask, start, size, vmid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid); /** * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote * harts for the current guest virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * * Return: None */ -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma); @@ -485,20 +512,20 @@ EXPORT_SYMBOL(sbi_remote_hfence_vvma); * remote harts for current guest virtual address range belonging to a specific * ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * @asid: The value of address space identifier (ASID). * * Return: None */ -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid); @@ -591,11 +618,7 @@ long sbi_get_mimpid(void) static void sbi_send_cpumask_ipi(const struct cpumask *target) { - struct cpumask hartid_mask; - - riscv_cpuid_to_hartid_mask(target, &hartid_mask); - - sbi_send_ipi(cpumask_bits(&hartid_mask)); + sbi_send_ipi(target); } static const struct riscv_ipi_ops sbi_ipi_ops = { diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 63241abe84eb..b42bfdc67482 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -59,16 +59,6 @@ atomic_t hart_lottery __section(".sdata") unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) -{ - int cpu; - - cpumask_clear(out); - for_each_cpu(cpu, in) - cpumask_set_cpu(cpuid_to_hartid_map(cpu), out); -} -EXPORT_SYMBOL_GPL(riscv_cpuid_to_hartid_mask); - /* * Place kernel memory regions on the resource tree so that * kexec-tools can retrieve them from /proc/iomem. While there diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index bd82375db51a..622f226454d5 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -96,7 +96,7 @@ void __init setup_smp(void) if (cpuid >= NR_CPUS) { pr_warn("Invalid cpuid [%d] for hartid [%d]\n", cpuid, hart); - break; + continue; } cpuid_to_hartid_map(cpuid) = hart; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 9af67dbdc66a..f80a34fbf102 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -114,7 +114,6 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) { - struct cpumask hmask; unsigned long size = PAGE_SIZE; struct kvm_vmid *vmid = &kvm->arch.vmid; @@ -127,8 +126,7 @@ static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) * where the Guest/VM is running. */ preempt_disable(); - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma_vmid(cpumask_bits(&hmask), addr, size, + sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size, READ_ONCE(vmid->vmid)); preempt_enable(); } diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 00036b7f83b9..1bc0608a5bfd 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -82,7 +82,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run { int ret = 0; unsigned long i; - struct cpumask cm, hm; + struct cpumask cm; struct kvm_vcpu *tmp; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; @@ -90,7 +90,6 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run unsigned long funcid = cp->a6; cpumask_clear(&cm); - cpumask_clear(&hm); kvm_for_each_vcpu(i, tmp, vcpu->kvm) { if (hbase != -1UL) { if (tmp->vcpu_id < hbase) @@ -103,17 +102,15 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run cpumask_set_cpu(tmp->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); - switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), cp->a2, + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2, cp->a3, cp->a4); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 4c7e13ec9ccc..07e2de14433a 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -38,7 +38,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, int i, ret = 0; u64 next_cycle; struct kvm_vcpu *rvcpu; - struct cpumask cm, hm; + struct cpumask cm; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -101,15 +101,12 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, continue; cpumask_set_cpu(rvcpu->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), - cp->a1, cp->a2); + ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2); else - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), - cp->a1, cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3); break; default: ret = -EINVAL; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 807228f8f409..2fa4f7b1813d 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -67,7 +67,6 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) { unsigned long i; struct kvm_vcpu *v; - struct cpumask hmask; struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) @@ -102,8 +101,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * running, we force VM exits on all host CPUs using IPI and * flush all Guest TLBs. */ - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma(cpumask_bits(&hmask), 0, 0); + sbi_remote_hfence_gvma(cpu_online_mask, 0, 0); } vmid->vmid = vmid_next; diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 89f81067e09e..6cb7d96ad9c7 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -67,10 +67,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local) */ smp_mb(); } else if (IS_ENABLED(CONFIG_RISCV_SBI)) { - cpumask_t hartid_mask; - - riscv_cpuid_to_hartid_mask(&others, &hartid_mask); - sbi_remote_fence_i(cpumask_bits(&hartid_mask)); + sbi_remote_fence_i(&others); } else { on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1); } diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 64f8201237c2..37ed760d007c 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -32,7 +32,6 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long size, unsigned long stride) { struct cpumask *cmask = mm_cpumask(mm); - struct cpumask hmask; unsigned int cpuid; bool broadcast; @@ -46,9 +45,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long asid = atomic_long_read(&mm->context.id); if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma_asid(cpumask_bits(&hmask), - start, size, asid); + sbi_remote_sfence_vma_asid(cmask, start, size, asid); } else if (size <= stride) { local_flush_tlb_page_asid(start, asid); } else { @@ -56,9 +53,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, } } else { if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma(cpumask_bits(&hmask), - start, size); + sbi_remote_sfence_vma(cmask, start, size); } else if (size <= stride) { local_flush_tlb_page(start); } else { From 3c2905ea79245fd37c2ab9d9384ab85d4732e3ef Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 20:24:58 +0100 Subject: [PATCH 0279/1295] riscv: canaan: remove useless select of non-existing config SYSCON The config SYSCON never existed in the kernel repository; so, the select of that config in ./drivers/soc/canaan/Kconfig has no effect. Presumably, this was just some mistake, assuming some symmetry in handling and naming of configs that simply does not exist. Remove this useless select of a non-existing config. Signed-off-by: Lukas Bulwahn Signed-off-by: Palmer Dabbelt --- drivers/soc/canaan/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig index 853096b7e84c..2527cf5757ec 100644 --- a/drivers/soc/canaan/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -5,7 +5,6 @@ config SOC_K210_SYSCTL depends on RISCV && SOC_CANAAN && OF default SOC_CANAAN select PM - select SYSCON select MFD_SYSCON help Canaan Kendryte K210 SoC system controller driver. From db3f02df1853acf4d678bcddb3f1eab23219b410 Mon Sep 17 00:00:00 2001 From: Ron Economos Date: Thu, 30 Dec 2021 22:11:06 -0800 Subject: [PATCH 0280/1295] riscv: dts: sifive unmatched: Add gpio poweroff Some of the GPIO pins on the Unmatched are wire up to control the power of the board, indicate that in the device tree. Signed-off-by: Ron Economos Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts index 6bfa1f24d3de..c4ed9efdff03 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -39,6 +39,11 @@ clock-frequency = ; clock-output-names = "rtcclk"; }; + + gpio-poweroff { + compatible = "gpio-poweroff"; + gpios = <&gpio 2 GPIO_ACTIVE_LOW>; + }; }; &uart0 { From 58dfff3e984dfb96dae98008e6ea0ab92248d003 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:53:25 -0600 Subject: [PATCH 0281/1295] dt-bindings: Drop unnecessary pinctrl properties For a single pinctrl mode, it is not necessary to define pinctrl properties as the tools always allow pinctrl properties. Signed-off-by: Rob Herring Acked-by: Charles Keepax Acked-by: Mark Brown Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220119015325.2438277-1-robh@kernel.org --- .../display/rockchip/rockchip,rk3066-hdmi.yaml | 8 -------- Documentation/devicetree/bindings/input/gpio-keys.yaml | 6 ------ .../devicetree/bindings/pinctrl/cirrus,lochnagar.yaml | 9 --------- .../devicetree/bindings/pinctrl/cirrus,madera.yaml | 10 ---------- .../devicetree/bindings/sound/samsung-i2s.yaml | 6 ------ 5 files changed, 39 deletions(-) diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml index 008c144257cb..1a68a940d165 100644 --- a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml @@ -26,14 +26,6 @@ properties: clock-names: const: hclk - pinctrl-0: - maxItems: 2 - - pinctrl-names: - const: default - description: - Switch the iomux for the HPD/I2C pins to HDMI function. - power-domains: maxItems: 1 diff --git a/Documentation/devicetree/bindings/input/gpio-keys.yaml b/Documentation/devicetree/bindings/input/gpio-keys.yaml index dbe7ecc19ccb..7fe1966ea28a 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.yaml +++ b/Documentation/devicetree/bindings/input/gpio-keys.yaml @@ -88,12 +88,6 @@ patternProperties: which can be disabled to suppress events from the button. type: boolean - pinctrl-0: - maxItems: 1 - - pinctrl-names: - maxItems: 1 - required: - linux,code diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml index a07dd197176a..6ff829a7d6e5 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml @@ -51,15 +51,6 @@ properties: appropriate of the LOCHNAGARx_PIN_NUM_GPIOS define, see [3]. maxItems: 1 - pinctrl-0: - description: - A phandle to the default pinctrl state. - - pinctrl-names: - description: - A pinctrl state named "default" must be defined. - const: default - pin-settings: type: object patternProperties: diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml index 4cb174bf31ff..8a90d8273767 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml @@ -30,16 +30,6 @@ description: | Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt properties: - pinctrl-0: - description: - A phandle to the node containing the subnodes containing default - configurations. - - pinctrl-names: - description: - A pinctrl state named "default" must be defined. - const: default - pin-settings: description: One subnode is required to contain the default settings. It diff --git a/Documentation/devicetree/bindings/sound/samsung-i2s.yaml b/Documentation/devicetree/bindings/sound/samsung-i2s.yaml index 2e3628ef48df..84c4d6cba521 100644 --- a/Documentation/devicetree/bindings/sound/samsung-i2s.yaml +++ b/Documentation/devicetree/bindings/sound/samsung-i2s.yaml @@ -110,12 +110,6 @@ properties: Internal DMA register base address of the audio subsystem (used in secondary sound source). - pinctrl-0: - description: Should specify pin control groups used for this controller. - - pinctrl-names: - const: default - power-domains: maxItems: 1 From 8da46c0f98a1a17bc8f6d324d82b4d26e1448869 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 12 Jan 2022 08:27:29 +0000 Subject: [PATCH 0282/1295] RISC-V: Remove redundant err variable Return value from user_regset_copyin() directly instead of taking this in another redundant variable. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: CGEL ZTE Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ptrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 9c0511119bad..a89243730153 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -42,12 +42,10 @@ static int riscv_gpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; struct pt_regs *regs; regs = task_pt_regs(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); } #ifdef CONFIG_FPU From 9b13bd53134c9ddd544a790125199fdbdb505e67 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 28 Oct 2021 13:51:14 +0000 Subject: [PATCH 0283/1295] i40e: Increase delay to 1 s after global EMP reset Recently simplified i40e_rebuild causes that FW sometimes is not ready after NVM update, the ping does not return. Increase the delay in case of EMP reset. Old delay of 300 ms was introduced for specific cards for 710 series. Now it works for all the cards and delay was increased. Fixes: 1fa51a650e1d ("i40e: Add delay after EMP reset for firmware to recover") Signed-off-by: Arkadiusz Kubalewski Signed-off-by: Jedrzej Jagielski Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a3d8aef7f4e..1b0fd3eacd64 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10574,15 +10574,9 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) } i40e_get_oem_version(&pf->hw); - if (test_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state) && - ((hw->aq.fw_maj_ver == 4 && hw->aq.fw_min_ver <= 33) || - hw->aq.fw_maj_ver < 4) && hw->mac.type == I40E_MAC_XL710) { - /* The following delay is necessary for 4.33 firmware and older - * to recover after EMP reset. 200 ms should suffice but we - * put here 300 ms to be sure that FW is ready to operate - * after reset. - */ - mdelay(300); + if (test_and_clear_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state)) { + /* The following delay is necessary for firmware update. */ + mdelay(1000); } /* re-verify the eeprom if we just had an EMP reset */ From d701658a50a471591094b3eb3961b4926cc8f104 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Fri, 5 Nov 2021 11:17:00 +0000 Subject: [PATCH 0284/1295] i40e: Fix issue when maximum queues is exceeded Before this patch VF interface vanished when maximum queue number was exceeded. Driver tried to add next queues even if there was not enough space. PF sent incorrect number of queues to the VF when there were not enough of them. Add an additional condition introduced to check available space in 'qp_pile' before proceeding. This condition makes it impossible to add queues if they number is greater than the number resulting from available space. Also add the search for free space in PF queue pair piles. Without this patch VF interfaces are not seen when available space for queues has been exceeded and following logs appears permanently in dmesg: "Unable to get VF config (-32)". "VF 62 failed opcode 3, retval: -5" "Unable to get VF config due to PF error condition, not retrying" Fixes: 7daa6bf3294e ("i40e: driver core headers") Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Jaroslaw Gawin Signed-off-by: Slawomir Laba Signed-off-by: Jedrzej Jagielski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +---- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 59 +++++++++++++++++++ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 4d939af0a626..c8cfe62d5e05 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -174,7 +174,6 @@ enum i40e_interrupt_policy { struct i40e_lump_tracking { u16 num_entries; - u16 search_hint; u16 list[0]; #define I40E_PILE_VALID_BIT 0x8000 #define I40E_IWARP_IRQ_PILE_ID (I40E_PILE_VALID_BIT - 2) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1b0fd3eacd64..9456641cd24a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -196,10 +196,6 @@ int i40e_free_virt_mem_d(struct i40e_hw *hw, struct i40e_virt_mem *mem) * @id: an owner id to stick on the items assigned * * Returns the base item index of the lump, or negative for error - * - * The search_hint trick and lack of advanced fit-finding only work - * because we're highly likely to have all the same size lump requests. - * Linear search time and any fragmentation should be minimal. **/ static int i40e_get_lump(struct i40e_pf *pf, struct i40e_lump_tracking *pile, u16 needed, u16 id) @@ -214,8 +210,7 @@ static int i40e_get_lump(struct i40e_pf *pf, struct i40e_lump_tracking *pile, return -EINVAL; } - /* start the linear search with an imperfect hint */ - i = pile->search_hint; + i = 0; while (i < pile->num_entries) { /* skip already allocated entries */ if (pile->list[i] & I40E_PILE_VALID_BIT) { @@ -234,7 +229,6 @@ static int i40e_get_lump(struct i40e_pf *pf, struct i40e_lump_tracking *pile, for (j = 0; j < needed; j++) pile->list[i+j] = id | I40E_PILE_VALID_BIT; ret = i; - pile->search_hint = i + j; break; } @@ -257,7 +251,7 @@ static int i40e_put_lump(struct i40e_lump_tracking *pile, u16 index, u16 id) { int valid_id = (id | I40E_PILE_VALID_BIT); int count = 0; - int i; + u16 i; if (!pile || index >= pile->num_entries) return -EINVAL; @@ -269,8 +263,6 @@ static int i40e_put_lump(struct i40e_lump_tracking *pile, u16 index, u16 id) count++; } - if (count && index < pile->search_hint) - pile->search_hint = index; return count; } @@ -11786,7 +11778,6 @@ static int i40e_init_interrupt_scheme(struct i40e_pf *pf) return -ENOMEM; pf->irq_pile->num_entries = vectors; - pf->irq_pile->search_hint = 0; /* track first vector for misc interrupts, ignore return */ (void)i40e_get_lump(pf, pf->irq_pile, 1, I40E_PILE_VALID_BIT - 1); @@ -12589,7 +12580,6 @@ static int i40e_sw_init(struct i40e_pf *pf) goto sw_init_done; } pf->qp_pile->num_entries = pf->hw.func_caps.num_tx_qp; - pf->qp_pile->search_hint = 0; pf->tx_timeout_recovery_level = 1; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index b785d09c19f8..7dd2a2bab339 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2617,6 +2617,59 @@ error_param: aq_ret); } +/** + * i40e_check_enough_queue - find big enough queue number + * @vf: pointer to the VF info + * @needed: the number of items needed + * + * Returns the base item index of the queue, or negative for error + **/ +static int i40e_check_enough_queue(struct i40e_vf *vf, u16 needed) +{ + unsigned int i, cur_queues, more, pool_size; + struct i40e_lump_tracking *pile; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi; + + vsi = pf->vsi[vf->lan_vsi_idx]; + cur_queues = vsi->alloc_queue_pairs; + + /* if current allocated queues are enough for need */ + if (cur_queues >= needed) + return vsi->base_queue; + + pile = pf->qp_pile; + if (cur_queues > 0) { + /* if the allocated queues are not zero + * just check if there are enough queues for more + * behind the allocated queues. + */ + more = needed - cur_queues; + for (i = vsi->base_queue + cur_queues; + i < pile->num_entries; i++) { + if (pile->list[i] & I40E_PILE_VALID_BIT) + break; + + if (more-- == 1) + /* there is enough */ + return vsi->base_queue; + } + } + + pool_size = 0; + for (i = 0; i < pile->num_entries; i++) { + if (pile->list[i] & I40E_PILE_VALID_BIT) { + pool_size = 0; + continue; + } + if (needed <= ++pool_size) + /* there is enough */ + return i; + } + + return -ENOMEM; +} + /** * i40e_vc_request_queues_msg * @vf: pointer to the VF info @@ -2651,6 +2704,12 @@ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg) req_pairs - cur_pairs, pf->queues_left); vfres->num_queue_pairs = pf->queues_left + cur_pairs; + } else if (i40e_check_enough_queue(vf, req_pairs) < 0) { + dev_warn(&pf->pdev->dev, + "VF %d requested %d more queues, but there is not enough for it.\n", + vf->vf_id, + req_pairs - cur_pairs); + vfres->num_queue_pairs = cur_pairs; } else { /* successful request */ vf->num_req_queues = req_pairs; From 92947844b8beee988c0ce17082b705c2f75f0742 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Fri, 26 Nov 2021 11:11:22 +0100 Subject: [PATCH 0285/1295] i40e: Fix queues reservation for XDP When XDP was configured on a system with large number of CPUs and X722 NIC there was a call trace with NULL pointer dereference. i40e 0000:87:00.0: failed to get tracking for 256 queues for VSI 0 err -12 i40e 0000:87:00.0: setup of MAIN VSI failed BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:i40e_xdp+0xea/0x1b0 [i40e] Call Trace: ? i40e_reconfig_rss_queues+0x130/0x130 [i40e] dev_xdp_install+0x61/0xe0 dev_xdp_attach+0x18a/0x4c0 dev_change_xdp_fd+0x1e6/0x220 do_setlink+0x616/0x1030 ? ahci_port_stop+0x80/0x80 ? ata_qc_issue+0x107/0x1e0 ? lock_timer_base+0x61/0x80 ? __mod_timer+0x202/0x380 rtnl_setlink+0xe5/0x170 ? bpf_lsm_binder_transaction+0x10/0x10 ? security_capable+0x36/0x50 rtnetlink_rcv_msg+0x121/0x350 ? rtnl_calcit.isra.0+0x100/0x100 netlink_rcv_skb+0x50/0xf0 netlink_unicast+0x1d3/0x2a0 netlink_sendmsg+0x22a/0x440 sock_sendmsg+0x5e/0x60 __sys_sendto+0xf0/0x160 ? __sys_getsockname+0x7e/0xc0 ? _copy_from_user+0x3c/0x80 ? __sys_setsockopt+0xc8/0x1a0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f83fa7a39e0 This was caused by PF queue pile fragmentation due to flow director VSI queue being placed right after main VSI. Because of this main VSI was not able to resize its queue allocation for XDP resulting in no queues allocated for main VSI when XDP was turned on. Fix this by always allocating last queue in PF queue pile for a flow director VSI. Fixes: 41c445ff0f48 ("i40e: main driver core") Fixes: 74608d17fe29 ("i40e: add support for XDP_TX action") Signed-off-by: Sylwester Dziedziuch Signed-off-by: Mateusz Palczewski Reviewed-by: Maciej Fijalkowski Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 9456641cd24a..9b65cb50282c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -210,6 +210,20 @@ static int i40e_get_lump(struct i40e_pf *pf, struct i40e_lump_tracking *pile, return -EINVAL; } + /* Allocate last queue in the pile for FDIR VSI queue + * so it doesn't fragment the qp_pile + */ + if (pile == pf->qp_pile && pf->vsi[id]->type == I40E_VSI_FDIR) { + if (pile->list[pile->num_entries - 1] & I40E_PILE_VALID_BIT) { + dev_err(&pf->pdev->dev, + "Cannot allocate queue %d for I40E_VSI_FDIR\n", + pile->num_entries - 1); + return -ENOMEM; + } + pile->list[pile->num_entries - 1] = id | I40E_PILE_VALID_BIT; + return pile->num_entries - 1; + } + i = 0; while (i < pile->num_entries) { /* skip already allocated entries */ From 0f344c8129a5337dae50e31b817dd50a60ff238c Mon Sep 17 00:00:00 2001 From: Karen Sornek Date: Thu, 2 Dec 2021 12:52:01 +0100 Subject: [PATCH 0286/1295] i40e: Fix for failed to init adminq while VF reset Fix for failed to init adminq: -53 while VF is resetting via MAC address changing procedure. Added sync module to avoid reading deadbeef value in reinit adminq during software reset. Without this patch it is possible to trigger VF reset procedure during reinit adminq. This resulted in an incorrect reading of value from the AQP registers and generated the -53 error. Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Signed-off-by: Grzegorz Szczurek Signed-off-by: Karen Sornek Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/i40e/i40e_register.h | 3 ++ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 44 ++++++++++++++++++- .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 + 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 8d0588a27a05..1908eed4fa5e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -413,6 +413,9 @@ #define I40E_VFINT_DYN_CTLN(_INTVF) (0x00024800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ #define I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT 1 #define I40E_VFINT_DYN_CTLN_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_VFINT_ICR0_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_ENA(_VF) (0x0002C000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ #define I40E_VPINT_AEQCTL(_VF) (0x0002B800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ #define I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT 0 #define I40E_VPINT_AEQCTL_ITR_INDX_SHIFT 11 diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 7dd2a2bab339..dfdb6e786461 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1376,6 +1376,32 @@ static i40e_status i40e_config_vf_promiscuous_mode(struct i40e_vf *vf, return aq_ret; } +/** + * i40e_sync_vfr_reset + * @hw: pointer to hw struct + * @vf_id: VF identifier + * + * Before trigger hardware reset, we need to know if no other process has + * reserved the hardware for any reset operations. This check is done by + * examining the status of the RSTAT1 register used to signal the reset. + **/ +static int i40e_sync_vfr_reset(struct i40e_hw *hw, int vf_id) +{ + u32 reg; + int i; + + for (i = 0; i < I40E_VFR_WAIT_COUNT; i++) { + reg = rd32(hw, I40E_VFINT_ICR0_ENA(vf_id)) & + I40E_VFINT_ICR0_ADMINQ_MASK; + if (reg) + return 0; + + usleep_range(100, 200); + } + + return -EAGAIN; +} + /** * i40e_trigger_vf_reset * @vf: pointer to the VF structure @@ -1390,9 +1416,11 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr) struct i40e_pf *pf = vf->pf; struct i40e_hw *hw = &pf->hw; u32 reg, reg_idx, bit_idx; + bool vf_active; + u32 radq; /* warn the VF */ - clear_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + vf_active = test_and_clear_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); /* Disable VF's configuration API during reset. The flag is re-enabled * in i40e_alloc_vf_res(), when it's safe again to access VF's VSI. @@ -1406,7 +1434,19 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr) * just need to clean up, so don't hit the VFRTRIG register. */ if (!flr) { - /* reset VF using VPGEN_VFRTRIG reg */ + /* Sync VFR reset before trigger next one */ + radq = rd32(hw, I40E_VFINT_ICR0_ENA(vf->vf_id)) & + I40E_VFINT_ICR0_ADMINQ_MASK; + if (vf_active && !radq) + /* waiting for finish reset by virtual driver */ + if (i40e_sync_vfr_reset(hw, vf->vf_id)) + dev_info(&pf->pdev->dev, + "Reset VF %d never finished\n", + vf->vf_id); + + /* Reset VF using VPGEN_VFRTRIG reg. It is also setting + * in progress state in rstat1 register. + */ reg = rd32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id)); reg |= I40E_VPGEN_VFRTRIG_VFSWR_MASK; wr32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id), reg); diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 49575a640a84..03c42fd0fea1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -19,6 +19,7 @@ #define I40E_MAX_VF_PROMISC_FLAGS 3 #define I40E_VF_STATE_WAIT_COUNT 20 +#define I40E_VFR_WAIT_COUNT 100 /* Various queue ctrls */ enum i40e_queue_ctrl { From 3b8428b84539c78fdc8006c17ebd25afd4722d51 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Dec 2021 17:56:33 -0800 Subject: [PATCH 0287/1295] i40e: fix unsigned stat widths Change i40e_update_vsi_stats and struct i40e_vsi to use u64 fields to match the width of the stats counters in struct i40e_rx_queue_stats. Update debugfs code to use the correct format specifier for u64. Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Joe Damato Reported-by: kernel test robot Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 8 ++++---- drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index c8cfe62d5e05..2e02cc68cd3f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -847,12 +847,12 @@ struct i40e_vsi { struct rtnl_link_stats64 net_stats_offsets; struct i40e_eth_stats eth_stats; struct i40e_eth_stats eth_stats_offsets; - u32 tx_restart; - u32 tx_busy; + u64 tx_restart; + u64 tx_busy; u64 tx_linearize; u64 tx_force_wb; - u32 rx_buf_failed; - u32 rx_page_failed; + u64 rx_buf_failed; + u64 rx_page_failed; /* These are containers of ring pointers, allocated at run-time */ struct i40e_ring **rx_rings; diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 2c1b1da1220e..1e57cc8c47d7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -240,7 +240,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) (unsigned long int)vsi->net_stats_offsets.rx_compressed, (unsigned long int)vsi->net_stats_offsets.tx_compressed); dev_info(&pf->pdev->dev, - " tx_restart = %d, tx_busy = %d, rx_buf_failed = %d, rx_page_failed = %d\n", + " tx_restart = %llu, tx_busy = %llu, rx_buf_failed = %llu, rx_page_failed = %llu\n", vsi->tx_restart, vsi->tx_busy, vsi->rx_buf_failed, vsi->rx_page_failed); rcu_read_lock(); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 9b65cb50282c..f70c478dafdb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -778,9 +778,9 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) struct rtnl_link_stats64 *ns; /* netdev stats */ struct i40e_eth_stats *oes; struct i40e_eth_stats *es; /* device's eth stats */ - u32 tx_restart, tx_busy; + u64 tx_restart, tx_busy; struct i40e_ring *p; - u32 rx_page, rx_buf; + u64 rx_page, rx_buf; u64 bytes, packets; unsigned int start; u64 tx_linearize; From 986536b952fd7070f5137358df7b055f3081dd2b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:56:26 -0600 Subject: [PATCH 0288/1295] dt-bindings: Fix array schemas encoded as matrices The YAML DT encoding has leaked into some array properties. Properties which are defined as an array should have a schema that's just an array. That means there should only be a single level of 'minItems', 'maxItems', and/or 'items'. Signed-off-by: Rob Herring Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20220119015627.2443334-1-robh@kernel.org --- .../bindings/media/nxp,imx7-mipi-csi2.yaml | 12 ++--- .../bindings/media/nxp,imx8mq-mipi-csi2.yaml | 12 ++--- .../bindings/net/can/bosch,m_can.yaml | 52 +++++++++---------- .../bindings/net/ethernet-controller.yaml | 51 ++++++++---------- .../devicetree/bindings/nvmem/nvmem.yaml | 17 +++--- 5 files changed, 66 insertions(+), 78 deletions(-) diff --git a/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml b/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml index 1ef849dc74d7..e2e6e9aa0fe6 100644 --- a/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml +++ b/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml @@ -81,14 +81,12 @@ properties: data-lanes: description: Note that 'fsl,imx7-mipi-csi2' only supports up to 2 data lines. + minItems: 1 items: - minItems: 1 - maxItems: 4 - items: - - const: 1 - - const: 2 - - const: 3 - - const: 4 + - const: 1 + - const: 2 + - const: 3 + - const: 4 required: - data-lanes diff --git a/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml b/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml index 9c04fa85ee5c..1b3e1c4b99ed 100644 --- a/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml +++ b/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml @@ -87,14 +87,12 @@ properties: properties: data-lanes: + minItems: 1 items: - minItems: 1 - maxItems: 4 - items: - - const: 1 - - const: 2 - - const: 3 - - const: 4 + - const: 1 + - const: 2 + - const: 3 + - const: 4 required: - data-lanes diff --git a/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml b/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml index fb547e26c676..401ab7cdb379 100644 --- a/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml +++ b/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml @@ -76,33 +76,31 @@ properties: M_CAN user manual for details. $ref: /schemas/types.yaml#/definitions/int32-array items: - items: - - description: The 'offset' is an address offset of the Message RAM where - the following elements start from. This is usually set to 0x0 if - you're using a private Message RAM. - default: 0 - - description: 11-bit Filter 0-128 elements / 0-128 words - minimum: 0 - maximum: 128 - - description: 29-bit Filter 0-64 elements / 0-128 words - minimum: 0 - maximum: 64 - - description: Rx FIFO 0 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Rx FIFO 1 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Rx Buffers 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Tx Event FIFO 0-32 elements / 0-64 words - minimum: 0 - maximum: 32 - - description: Tx Buffers 0-32 elements / 0-576 words - minimum: 0 - maximum: 32 - maxItems: 1 + - description: The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to 0x0 if + you're using a private Message RAM. + default: 0 + - description: 11-bit Filter 0-128 elements / 0-128 words + minimum: 0 + maximum: 128 + - description: 29-bit Filter 0-64 elements / 0-128 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 0 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 1 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx Buffers 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Tx Event FIFO 0-32 elements / 0-64 words + minimum: 0 + maximum: 32 + - description: Tx Buffers 0-32 elements / 0-576 words + minimum: 0 + maximum: 32 power-domains: description: diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 47b5f728701d..34c5463abcec 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -17,9 +17,8 @@ properties: description: Specifies the MAC address that was assigned to the network device. $ref: /schemas/types.yaml#/definitions/uint8-array - items: - - minItems: 6 - maxItems: 6 + minItems: 6 + maxItems: 6 mac-address: description: @@ -28,9 +27,8 @@ properties: to the device by the boot program is different from the local-mac-address property. $ref: /schemas/types.yaml#/definitions/uint8-array - items: - - minItems: 6 - maxItems: 6 + minItems: 6 + maxItems: 6 max-frame-size: $ref: /schemas/types.yaml#/definitions/uint32 @@ -164,33 +162,30 @@ properties: type: array then: deprecated: true - minItems: 1 - maxItems: 1 items: - items: - - minimum: 0 - maximum: 31 - description: - Emulated PHY ID, choose any but unique to the all - specified fixed-links + - minimum: 0 + maximum: 31 + description: + Emulated PHY ID, choose any but unique to the all + specified fixed-links - - enum: [0, 1] - description: - Duplex configuration. 0 for half duplex or 1 for - full duplex + - enum: [0, 1] + description: + Duplex configuration. 0 for half duplex or 1 for + full duplex - - enum: [10, 100, 1000, 2500, 10000] - description: - Link speed in Mbits/sec. + - enum: [10, 100, 1000, 2500, 10000] + description: + Link speed in Mbits/sec. - - enum: [0, 1] - description: - Pause configuration. 0 for no pause, 1 for pause + - enum: [0, 1] + description: + Pause configuration. 0 for no pause, 1 for pause - - enum: [0, 1] - description: - Asymmetric pause configuration. 0 for no asymmetric - pause, 1 for asymmetric pause + - enum: [0, 1] + description: + Asymmetric pause configuration. 0 for no asymmetric + pause, 1 for asymmetric pause - if: diff --git a/Documentation/devicetree/bindings/nvmem/nvmem.yaml b/Documentation/devicetree/bindings/nvmem/nvmem.yaml index 456fb808100a..43ed7e32e5ac 100644 --- a/Documentation/devicetree/bindings/nvmem/nvmem.yaml +++ b/Documentation/devicetree/bindings/nvmem/nvmem.yaml @@ -50,16 +50,15 @@ patternProperties: Offset and size in bytes within the storage device. bits: - maxItems: 1 + $ref: /schemas/types.yaml#/definitions/uint32-array items: - items: - - minimum: 0 - maximum: 7 - description: - Offset in bit within the address range specified by reg. - - minimum: 1 - description: - Size in bit within the address range specified by reg. + - minimum: 0 + maximum: 7 + description: + Offset in bit within the address range specified by reg. + - minimum: 1 + description: + Size in bit within the address range specified by reg. required: - reg From 25e20b505e0e2454ff6713c0cef4bcdd66dffb95 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:56:11 -0600 Subject: [PATCH 0289/1295] dt-bindings: mfd: cirrus,madera: Fix 'interrupts' in example The 'interrupts' properties takes an irq number, not a phandle, and 'interrupt-parent' isn't needed in examples. Signed-off-by: Rob Herring Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20220119015611.2442819-1-robh@kernel.org --- Documentation/devicetree/bindings/mfd/cirrus,madera.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml b/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml index 5dce62a7eff2..68c75a517c92 100644 --- a/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml @@ -245,8 +245,7 @@ examples: interrupt-controller; #interrupt-cells = <2>; - interrupts = <&host_irq1>; - interrupt-parent = <&gic>; + interrupts = <4 1 0>; gpio-controller; #gpio-cells = <2>; From 66a8f7f04979f4ad739085f01d99c8caf620b4f5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:02 +0100 Subject: [PATCH 0290/1295] of: base: make small of_parse_phandle() variants static inline Make all the smaller variants of the of_parse_phandle() static inline. This also let us remove the empty function stubs if CONFIG_OF is not defined. Suggested-by: Rob Herring Signed-off-by: Michael Walle [robh: move index < 0 check into __of_parse_phandle_with_args] Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-2-michael@walle.cc --- drivers/of/base.c | 131 +++------------------------------------ include/linux/of.h | 148 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 129 insertions(+), 150 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 8a24d37153b4..e7d92b67cb8a 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1420,15 +1420,18 @@ int of_phandle_iterator_args(struct of_phandle_iterator *it, return count; } -static int __of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, - const char *cells_name, - int cell_count, int index, - struct of_phandle_args *out_args) +int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count, int index, + struct of_phandle_args *out_args) { struct of_phandle_iterator it; int rc, cur_index = 0; + if (index < 0) + return -EINVAL; + /* Loop over the phandles until all the requested entry is found */ of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* @@ -1471,82 +1474,7 @@ static int __of_parse_phandle_with_args(const struct device_node *np, of_node_put(it.node); return rc; } - -/** - * of_parse_phandle - Resolve a phandle property to a device_node pointer - * @np: Pointer to device node holding phandle property - * @phandle_name: Name of property holding a phandle value - * @index: For properties holding a table of phandles, this is the index into - * the table - * - * Return: The device_node pointer with refcount incremented. Use - * of_node_put() on it when done. - */ -struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, int index) -{ - struct of_phandle_args args; - - if (index < 0) - return NULL; - - if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0, - index, &args)) - return NULL; - - return args.np; -} -EXPORT_SYMBOL(of_parse_phandle); - -/** - * of_parse_phandle_with_args() - Find a node pointed by phandle in a list - * @np: pointer to a device tree node containing a list - * @list_name: property name that contains a list - * @cells_name: property name that specifies phandles' arguments count - * @index: index of a phandle to parse out - * @out_args: optional pointer to output arguments structure (will be filled) - * - * This function is useful to parse lists of phandles and their arguments. - * Returns 0 on success and fills out_args, on error returns appropriate - * errno value. - * - * Caller is responsible to call of_node_put() on the returned out_args->np - * pointer. - * - * Example:: - * - * phandle1: node1 { - * #list-cells = <2>; - * }; - * - * phandle2: node2 { - * #list-cells = <1>; - * }; - * - * node3 { - * list = <&phandle1 1 2 &phandle2 3>; - * }; - * - * To get a device_node of the ``node2`` node you may call this: - * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args); - */ -int of_parse_phandle_with_args(const struct device_node *np, const char *list_name, - const char *cells_name, int index, - struct of_phandle_args *out_args) -{ - int cell_count = -1; - - if (index < 0) - return -EINVAL; - - /* If cells_name is NULL we assume a cell count of 0 */ - if (!cells_name) - cell_count = 0; - - return __of_parse_phandle_with_args(np, list_name, cells_name, - cell_count, index, out_args); -} -EXPORT_SYMBOL(of_parse_phandle_with_args); +EXPORT_SYMBOL(__of_parse_phandle_with_args); /** * of_parse_phandle_with_args_map() - Find a node pointed by phandle in a list and remap it @@ -1732,47 +1660,6 @@ free: } EXPORT_SYMBOL(of_parse_phandle_with_args_map); -/** - * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list - * @np: pointer to a device tree node containing a list - * @list_name: property name that contains a list - * @cell_count: number of argument cells following the phandle - * @index: index of a phandle to parse out - * @out_args: optional pointer to output arguments structure (will be filled) - * - * This function is useful to parse lists of phandles and their arguments. - * Returns 0 on success and fills out_args, on error returns appropriate - * errno value. - * - * Caller is responsible to call of_node_put() on the returned out_args->np - * pointer. - * - * Example:: - * - * phandle1: node1 { - * }; - * - * phandle2: node2 { - * }; - * - * node3 { - * list = <&phandle1 0 2 &phandle2 2 3>; - * }; - * - * To get a device_node of the ``node2`` node you may call this: - * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args); - */ -int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cell_count, - int index, struct of_phandle_args *out_args) -{ - if (index < 0) - return -EINVAL; - return __of_parse_phandle_with_args(np, list_name, NULL, cell_count, - index, out_args); -} -EXPORT_SYMBOL(of_parse_phandle_with_fixed_args); - /** * of_count_phandle_with_args() - Find the number of phandles references in a property * @np: pointer to a device tree node containing a list diff --git a/include/linux/of.h b/include/linux/of.h index ff143a027abc..16d76c92fbe0 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -364,18 +364,12 @@ extern const struct of_device_id *of_match_node( const struct of_device_id *matches, const struct device_node *node); extern int of_modalias_node(struct device_node *node, char *modalias, int len); extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args); -extern struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index); -extern int of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, const char *cells_name, int index, - struct of_phandle_args *out_args); +extern int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, const char *cells_name, int cell_count, + int index, struct of_phandle_args *out_args); extern int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args); -extern int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args); extern int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name); @@ -865,18 +859,12 @@ static inline int of_property_read_string_helper(const struct device_node *np, return -ENOSYS; } -static inline struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index) -{ - return NULL; -} - -static inline int of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, - const char *cells_name, - int index, - struct of_phandle_args *out_args) +static inline int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count, + int index, + struct of_phandle_args *out_args) { return -ENOSYS; } @@ -890,13 +878,6 @@ static inline int of_parse_phandle_with_args_map(const struct device_node *np, return -ENOSYS; } -static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args) -{ - return -ENOSYS; -} - static inline int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) @@ -1077,6 +1058,117 @@ static inline bool of_node_is_type(const struct device_node *np, const char *typ return np && match && type && !strcmp(match, type); } +/** + * of_parse_phandle - Resolve a phandle property to a device_node pointer + * @np: Pointer to device node holding phandle property + * @phandle_name: Name of property holding a phandle value + * @index: For properties holding a table of phandles, this is the index into + * the table + * + * Return: The device_node pointer with refcount incremented. Use + * of_node_put() on it when done. + */ +static inline struct device_node *of_parse_phandle(const struct device_node *np, + const char *phandle_name, + int index) +{ + struct of_phandle_args args; + + if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0, + index, &args)) + return NULL; + + return args.np; +} + +/** + * of_parse_phandle_with_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cells_name: property name that specifies phandles' arguments count + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. + * + * Example:: + * + * phandle1: node1 { + * #list-cells = <2>; + * }; + * + * phandle2: node2 { + * #list-cells = <1>; + * }; + * + * node3 { + * list = <&phandle1 1 2 &phandle2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args); + */ +static inline int of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int index, + struct of_phandle_args *out_args) +{ + int cell_count = -1; + + /* If cells_name is NULL we assume a cell count of 0 */ + if (!cells_name) + cell_count = 0; + + return __of_parse_phandle_with_args(np, list_name, cells_name, + cell_count, index, out_args); +} + +/** + * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cell_count: number of argument cells following the phandle + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. + * + * Example:: + * + * phandle1: node1 { + * }; + * + * phandle2: node2 { + * }; + * + * node3 { + * list = <&phandle1 0 2 &phandle2 2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args); + */ +static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, + const char *list_name, + int cell_count, + int index, + struct of_phandle_args *out_args) +{ + return __of_parse_phandle_with_args(np, list_name, NULL, cell_count, + index, out_args); +} + /** * of_property_count_u8_elems - Count the number of u8 elements in a property * From 2ca42c3ad9ed875b136065b010753a4caaaa1d38 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:03 +0100 Subject: [PATCH 0291/1295] of: property: define of_property_read_u{8,16,32,64}_array() unconditionally We can get rid of all the empty stubs because all these functions call of_property_read_variable_u{8,16,32,64}_array() which already have an empty stub if CONFIG_OF is not defined. Signed-off-by: Michael Walle Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-3-michael@walle.cc --- include/linux/of.h | 274 ++++++++++++++++++++------------------------- 1 file changed, 124 insertions(+), 150 deletions(-) diff --git a/include/linux/of.h b/include/linux/of.h index 16d76c92fbe0..2dc77430a91a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,130 +410,6 @@ extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) -/** - * of_property_read_u8_array - Find and read an array of u8 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 8-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 8 <0x50 0x60 0x70>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u8 value can be decoded. - */ -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, - u8 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u8_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u16_array - Find and read an array of u16 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 16-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u16 value can be decoded. - */ -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, - u16 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u16_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u32_array - Find and read an array of 32 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 32-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u32 value can be decoded. - */ -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u32_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u64_array - Find and read an array of 64 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 64-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u64 value can be decoded. - */ -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u64_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - /* * struct property *prop; * const __be32 *p; @@ -728,32 +604,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, u8 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, u16 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - return -ENOSYS; -} - static inline int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value) { @@ -1328,6 +1178,130 @@ static inline bool of_property_read_bool(const struct device_node *np, return prop ? true : false; } +/** + * of_property_read_u8_array - Find and read an array of u8 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 8-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 8 <0x50 0x60 0x70>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u8 value can be decoded. + */ +static inline int of_property_read_u8_array(const struct device_node *np, + const char *propname, + u8 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u8_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u16_array - Find and read an array of u16 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 16-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u16 value can be decoded. + */ +static inline int of_property_read_u16_array(const struct device_node *np, + const char *propname, + u16 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u16_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u32_array - Find and read an array of 32 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 32-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u32 value can be decoded. + */ +static inline int of_property_read_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u32_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u64_array - Find and read an array of 64 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 64-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u64 value can be decoded. + */ +static inline int of_property_read_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u64_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + static inline int of_property_read_u8(const struct device_node *np, const char *propname, u8 *out_value) From 9b22c17a3cc5f61b195da624cbb48634b4e42055 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 11:34:04 -0600 Subject: [PATCH 0292/1295] of: Check 'of_node_reused' flag on of_match_device() Commit 0f153a1b8193 ("usb: chipidea: Set the DT node on the child device") caused the child device to match on the parent driver instead of the child's driver since the child's DT node pointer matched. The worst case result is a loop of the parent driver probing another instance and creating yet another child device eventually exhausting the stack. If the child driver happens to match first, then everything works fine. A device sharing the DT node should never do DT based driver matching, so let's simply check of_node_reused in of_match_device() to prevent that. Fixes: 0f153a1b8193 ("usb: chipidea: Set the DT node on the child device") Link: https://lore.kernel.org/all/20220114105620.GK18506@ediswmail.ad.cirrus.com/ Reported-by: Charles Keepax Cc: Frank Rowand Cc: Arnd Bergmann Cc: Tony Lindgren Cc: Greg Kroah-Hartman Cc: Peter Chen Tested-by: Charles Keepax Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173404.1891800-1-robh@kernel.org --- drivers/of/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/device.c b/drivers/of/device.c index b0800c260f64..874f031442dc 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -28,7 +28,7 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { - if ((!matches) || (!dev->of_node)) + if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } From bd25c378527f3fde38a496ac2744cbd3924f0803 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 14 Jan 2022 07:52:07 +0100 Subject: [PATCH 0293/1295] parisc: Use safer strscpy() in setup_cmdline() Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index cceb09855e03..456087a2350c 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -56,7 +56,7 @@ void __init setup_cmdline(char **cmdline_p) /* called from hpux boot loader */ boot_command_line[0] = '\0'; } else { - strlcpy(boot_command_line, (char *)__va(boot_args[1]), + strscpy(boot_command_line, (char *)__va(boot_args[1]), COMMAND_LINE_SIZE); #ifdef CONFIG_BLK_DEV_INITRD @@ -68,7 +68,7 @@ void __init setup_cmdline(char **cmdline_p) #endif } - strcpy(command_line, boot_command_line); + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; } From 5f7ee6e37a3cadefe45378c17c4285fa41141d92 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 14 Jan 2022 07:57:20 +0100 Subject: [PATCH 0294/1295] parisc: Autodetect default output device and set console= kernel parameter Usually palo (the PA-RISC boot loader) will check at boot time if the machine/firmware was configured to use the serial line (ttyS0, SERIAL_x) or the graphical display (tty0, graph) as default output device and add the correct "console=ttyS0" or "console=tty0" Linux kernel parameter to the kernel command line when starting the Linux kernel. But the kernel could also have been started via the HP-UX boot loader or directly in qemu, in which cases the console parameter is missing. This patch fixes this problem by adding the correct console= parameter if it's missing in the current kernel command line. Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 456087a2350c..b91cb45ffd4e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -48,6 +48,7 @@ struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL; void __init setup_cmdline(char **cmdline_p) { extern unsigned int boot_args[]; + char *p; /* Collect stuff passed in from the boot loader */ @@ -59,6 +60,16 @@ void __init setup_cmdline(char **cmdline_p) strscpy(boot_command_line, (char *)__va(boot_args[1]), COMMAND_LINE_SIZE); + /* autodetect console type (if not done by palo yet) */ + p = boot_command_line; + if (!str_has_prefix(p, "console=") && !strstr(p, " console=")) { + strlcat(p, " console=", COMMAND_LINE_SIZE); + if (PAGE0->mem_cons.cl_class == CL_DUPLEX) + strlcat(p, "ttyS0", COMMAND_LINE_SIZE); + else + strlcat(p, "tty0", COMMAND_LINE_SIZE); + } + #ifdef CONFIG_BLK_DEV_INITRD if (boot_args[2] != 0) /* did palo pass us a ramdisk? */ { From 30f308999426871e1b896384093e9a681099f521 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 17 Jan 2022 10:10:10 +0100 Subject: [PATCH 0295/1295] parisc: Fix missing prototype for 'toc_intr' warning in toc.c Fix a missing prototype warning noticed by the kernel test robot. Reported-by: kernel test robot Signed-off-by: Helge Deller --- arch/parisc/include/asm/processor.h | 1 + arch/parisc/kernel/toc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index b669f4b9040b..3a3d05438408 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -289,6 +289,7 @@ extern int _parisc_requires_coherency; extern int running_on_qemu; +extern void __noreturn toc_intr(struct pt_regs *regs); extern void toc_handler(void); extern unsigned int toc_handler_size; extern unsigned int toc_handler_csum; diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c index fa5a10eaf0aa..e4b48d07afbd 100644 --- a/arch/parisc/kernel/toc.c +++ b/arch/parisc/kernel/toc.c @@ -10,6 +10,7 @@ #include #include #include +#include static unsigned int __aligned(16) toc_lock = 1; DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible; From 5e547d60dae7c66fe0c33654474eedcc1ddace67 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jan 2022 11:40:08 +0100 Subject: [PATCH 0296/1295] dt-bindings: display: bridge: drop Enric Balletbo i Serra from maintainers Enric Balletbo i Serra emails bounce: : Recipient address rejected: User unknown in local recipient table so drop him from the maintainers, similarly to commit 3119c28634dd ("MAINTAINERS: Chrome: Drop Enric Balletbo i Serra"). Add generic DRM bridge maintainers to Analogix ANX7814. Signed-off-by: Krzysztof Kozlowski Acked-by: Neil Armstrong Acked-by: Enric Balletbo i Serra Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220120104009.159147-1-krzysztof.kozlowski@canonical.com --- .../devicetree/bindings/display/bridge/analogix,anx7814.yaml | 4 +++- .../bindings/display/bridge/google,cros-ec-anx7688.yaml | 1 - Documentation/devicetree/bindings/display/bridge/ps8640.yaml | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml index 8e13f27b28ed..bce96b5b0db0 100644 --- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml +++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml @@ -7,7 +7,9 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Analogix ANX7814 SlimPort (Full-HD Transmitter) maintainers: - - Enric Balletbo i Serra + - Andrzej Hajda + - Neil Armstrong + - Robert Foss properties: compatible: diff --git a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml index 9f7cc6b757cb..a88a5d8c7ba5 100644 --- a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml +++ b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml @@ -8,7 +8,6 @@ title: ChromeOS EC ANX7688 HDMI to DP Converter through Type-C Port maintainers: - Nicolas Boichat - - Enric Balletbo i Serra description: | ChromeOS EC ANX7688 is a display bridge that converts HDMI 2.0 to diff --git a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml index cdaf7a7a8f88..186e17be51fb 100644 --- a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml @@ -8,7 +8,6 @@ title: MIPI DSI to eDP Video Format Converter Device Tree Bindings maintainers: - Nicolas Boichat - - Enric Balletbo i Serra description: | The PS8640 is a low power MIPI-to-eDP video format converter supporting From 18a86e5907f7160fb548d0d717e0f842b310708a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jan 2022 11:40:09 +0100 Subject: [PATCH 0297/1295] dt-bindings: google,cros-ec: drop Enric Balletbo i Serra from maintainers Enric Balletbo i Serra emails bounce: : Recipient address rejected: User unknown in local recipient table so drop him from the maintainers, similarly to commit 3119c28634dd ("MAINTAINERS: Chrome: Drop Enric Balletbo i Serra"). Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Acked-by: Enric Balletbo i Serra Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220120104009.159147-2-krzysztof.kozlowski@canonical.com --- .../devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml | 1 - .../devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml | 1 - .../bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml | 1 - Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml | 1 - Documentation/devicetree/bindings/mfd/google,cros-ec.yaml | 1 - 5 files changed, 5 deletions(-) diff --git a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml index 20e1ccfc8630..2d82b44268db 100644 --- a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml +++ b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml @@ -8,7 +8,6 @@ title: ChromeOS EC USB Type-C cable and accessories detection maintainers: - Benson Leung - - Enric Balletbo i Serra description: | On ChromeOS systems with USB Type C ports, the ChromeOS Embedded Controller is diff --git a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml index b386e4128a79..6e1c70e9275e 100644 --- a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml +++ b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml @@ -10,7 +10,6 @@ title: I2C bus that tunnels through the ChromeOS EC (cros-ec) maintainers: - Doug Anderson - Benson Leung - - Enric Balletbo i Serra description: | On some ChromeOS board designs we've got a connection to the EC diff --git a/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml index 099b4be927d4..00e3b59641d2 100644 --- a/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml +++ b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml @@ -10,7 +10,6 @@ title: ChromeOS EC MKBP Proximity Sensor maintainers: - Stephen Boyd - Benson Leung - - Enric Balletbo i Serra description: | Google's ChromeOS EC sometimes has the ability to detect user proximity. diff --git a/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml b/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml index 5377b232fa10..e8f137abb03c 100644 --- a/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml +++ b/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml @@ -10,7 +10,6 @@ title: ChromeOS EC Keyboard maintainers: - Simon Glass - Benson Leung - - Enric Balletbo i Serra description: | Google's ChromeOS EC Keyboard is a simple matrix keyboard diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml index d793dd0316b7..58a1a9405228 100644 --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml +++ b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml @@ -8,7 +8,6 @@ title: ChromeOS Embedded Controller maintainers: - Benson Leung - - Enric Balletbo i Serra - Guenter Roeck description: From c59cd507fb640c2acc6b07cb60d7f765839e18c7 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 19 Jan 2022 10:39:37 -0800 Subject: [PATCH 0298/1295] RISC-V: nommu_virt: Drop unused SLAB_MERGE_DEFAULT Our nommu_virt_defconfig set SLOB=y and SLAB_MERGE_DEFAULT=n. As of eb52c0fc2331 ("mm: Make SLAB_MERGE_DEFAULT depend on SL[AU]B") it's no longer necessary to set the second, which appears to never have had any effect for SLOB=y anyway. This was suggested by savedefconfig. Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/nommu_virt_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index 5f078156dfcd..e1c9864b6237 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -24,7 +24,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLOB=y -# CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_VIRT=y CONFIG_SMP=y From b0ac702f3329cdc8a06dcaac73183d4b5a2b942d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 18 Jan 2022 19:39:05 -0800 Subject: [PATCH 0299/1295] Documentation: fix firewire.rst ABI file path error Adjust the path of the ABI files for firewire.rst to prevent a documentation build error. Prevents this problem: Sphinx parallel build error: docutils.utils.SystemMessage: Documentation/driver-api/firewire.rst:22: (SEVERE/4) Problems with "include" directive path: InputError: [Errno 2] No such file or directory: '../Documentation/driver-api/ABI/stable/firewire-cdev'. Fixes: 2f4830ef96d2 ("FireWire: add driver-api Introduction section") Signed-off-by: Randy Dunlap Tested-by: Akira Yokosawa Link: https://lore.kernel.org/r/20220119033905.4779-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/driver-api/firewire.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/driver-api/firewire.rst b/Documentation/driver-api/firewire.rst index 94a2d7f01d99..d3cfa73cbb2b 100644 --- a/Documentation/driver-api/firewire.rst +++ b/Documentation/driver-api/firewire.rst @@ -19,7 +19,7 @@ of kernel interfaces is available via exported symbols in `firewire-core` module Firewire char device data structures ==================================== -.. include:: /ABI/stable/firewire-cdev +.. include:: ../ABI/stable/firewire-cdev :literal: .. kernel-doc:: include/uapi/linux/firewire-cdev.h @@ -28,7 +28,7 @@ Firewire char device data structures Firewire device probing and sysfs interfaces ============================================ -.. include:: /ABI/stable/sysfs-bus-firewire +.. include:: ../ABI/stable/sysfs-bus-firewire :literal: .. kernel-doc:: drivers/firewire/core-device.c From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: [PATCH 0300/1295] unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanic, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- fs/Makefile | 2 +- fs/ext4/ext4.h | 14 +++++++------- fs/ext4/hash.c | 2 +- fs/ext4/namei.c | 12 ++++++------ fs/ext4/super.c | 10 +++++----- fs/ext4/sysfs.c | 8 ++++---- fs/f2fs/dir.c | 10 +++++----- fs/f2fs/f2fs.h | 2 +- fs/f2fs/hash.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/super.c | 10 +++++----- fs/f2fs/sysfs.c | 10 +++++----- fs/libfs.c | 10 +++++----- fs/unicode/Kconfig | 18 +++++------------- fs/unicode/Makefile | 6 ++++-- include/linux/fs.h | 2 +- 17 files changed, 60 insertions(+), 66 deletions(-) diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..c71ee0127866 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_UNICODE) += unicode/ +obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 71a3cdceaa03..242e74cfb060 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2485,7 +2485,7 @@ struct ext4_filename { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct fscrypt_str cf_name; #endif }; @@ -2721,7 +2721,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); @@ -2754,7 +2754,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif return err; @@ -2773,7 +2773,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif return err; @@ -2790,7 +2790,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif @@ -2806,7 +2806,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif @@ -2822,7 +2822,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index f34f4176c1e7..147b5241dd94 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122..269d2d051ede 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. If quick is set, assume the name being looked up @@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent, f.crypto_buf = fname->crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { @@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; @@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the @@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) ext4_fc_track_unlink(handle, dentry); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db9fe4843529..52be1ca38eef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1302,7 +1302,7 @@ static void ext4_put_super(struct super_block *sb) kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -1962,7 +1962,7 @@ static const struct mount_opts { {Opt_err, 0, 0} }; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; @@ -3609,7 +3609,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " @@ -4613,7 +4613,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -5517,7 +5517,7 @@ failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f61e65ae27d8..d233c24ea342 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -309,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize); EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY @@ -317,7 +317,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif @@ -329,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY @@ -337,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif NULL, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1820e9c106f7..166f08623362 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,7 +16,7 @@ #include "xattr.h" #include -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { @@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname) kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; @@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, return f2fs_find_target_dentry(&d, fname, max_slots); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. @@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir, { struct fscrypt_name f; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d603187171..4da88928ffb5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -487,7 +487,7 @@ struct f2fs_filename { */ struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the directory is both diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e3beac546c63..3cb1e7a24740 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) return; } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(dir)) { /* * If the casefolded name is provided, hash it instead of the diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a728a0af9ce0..5f213f05556d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -561,7 +561,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -622,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d1664a0567ef..2fbbc820c00a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -46,7 +46,7 @@ static struct kmem_cache *fsync_entry_slab; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -149,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, if (err) return err; f2fs_hash_filename(dir, fname); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* Case-sensitive match is fine for recovery */ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15f12ece0ac6..b870c6459fa1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -256,7 +256,7 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) va_end(args); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; @@ -1218,7 +1218,7 @@ default_check: return -EINVAL; } #endif -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); @@ -1578,7 +1578,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -3861,7 +3861,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -4412,7 +4412,7 @@ free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); sb->s_encoding = NULL; #endif diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8408f77764e8..fa3d9cb2d69b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -192,7 +192,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) @@ -756,7 +756,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -775,7 +775,7 @@ F2FS_FEATURE_RO_ATTR(lost_found); F2FS_FEATURE_RO_ATTR(verity); #endif F2FS_FEATURE_RO_ATTR(sb_checksum); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(casefold); #endif F2FS_FEATURE_RO_ATTR(readonly); @@ -886,7 +886,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -905,7 +905,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif ATTR_LIST(readonly), diff --git a/fs/libfs.c b/fs/libfs.c index ba7438ab9371..974125270a42 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1379,7 +1379,7 @@ bool is_empty_dir_inode(struct inode *inode) (inode->i_op == &empty_dir_inode_operations); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Determine if the name of a dentry should be casefolded. * @@ -1473,7 +1473,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, @@ -1508,10 +1508,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; @@ -1523,7 +1523,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) return; } #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 610d7bc05d6e..da786a687fdc 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -3,21 +3,13 @@ # UTF-8 normalization # config UNICODE - bool "UTF-8 normalization and casefolding support" + tristate "UTF-8 normalization and casefolding support" help Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding - support. - -config UNICODE_UTF8_DATA - tristate "UTF-8 normalization and casefolding tables" - depends on UNICODE - default UNICODE - help - This contains a large table of case foldings, which can be loaded as - a separate module if you say M here. To be on the safe side stick - to the default of Y. Saying N here makes no sense, if you do not want - utf8 casefolding support, disable CONFIG_UNICODE instead. + support. If you say M here the large table of case foldings will + be a separate loadable module that gets requested only when a file + system actually use it. config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" - depends on UNICODE_UTF8_DATA + depends on UNICODE diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 2f9d9188852b..0cc87423de82 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_UNICODE) += unicode.o +ifneq ($(CONFIG_UNICODE),) +obj-y += unicode.o +endif +obj-$(CONFIG_UNICODE) += utf8data.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o -obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o unicode-y := utf8-norm.o utf8-core.o diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..fdac22d16c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif From a0af3d1104f752b6d0dba71788e3fddd67c857a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2022 18:54:52 +0100 Subject: [PATCH 0301/1295] PCI/MSI: Prevent UAF in error path When the core MSI allocation fails, then the PCI/MSI code uses an already freed MSI descriptor to unmask the MSI mask register in order to bring it back into reset state. Remove MSI_FLAG_FREE_MSI_DESCS from the PCI/MSI irqdomain flags and let the PCI/MSI code free the MSI descriptors after usage. Fixes: 0f62d941acf9 ("genirq/msi: Provide msi_domain_alloc/free_irqs_descs_locked()") Reported-by: Tong Zhang Signed-off-by: Thomas Gleixner Tested-by: Tong Zhang Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/87r1938vbn.ffs@tglx --- drivers/pci/msi/irqdomain.c | 4 ++-- drivers/pci/msi/legacy.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0d63541c4052..e9cf318e6670 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -28,6 +28,7 @@ void pci_msi_teardown_msi_irqs(struct pci_dev *dev) msi_domain_free_irqs_descs_locked(domain, &dev->dev); else pci_msi_legacy_teardown_msi_irqs(dev); + msi_free_msi_descs(&dev->dev); } /** @@ -171,8 +172,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); - info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS | - MSI_FLAG_FREE_MSI_DESCS; + info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS; if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) info->flags |= MSI_FLAG_MUST_REACTIVATE; diff --git a/drivers/pci/msi/legacy.c b/drivers/pci/msi/legacy.c index cdbb4689db78..db761adef652 100644 --- a/drivers/pci/msi/legacy.c +++ b/drivers/pci/msi/legacy.c @@ -77,5 +77,4 @@ void pci_msi_legacy_teardown_msi_irqs(struct pci_dev *dev) { msi_device_destroy_sysfs(&dev->dev); arch_teardown_msi_irqs(dev); - msi_free_msi_descs(&dev->dev); } From cbda1b16687580d5beee38273f6241ae3725960c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Wed, 19 Jan 2022 17:27:48 +0100 Subject: [PATCH 0302/1295] phylib: fix potential use-after-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bafbdd527d56 ("phylib: Add device reset GPIO support") added call to phy_device_reset(phydev) after the put_device() call in phy_detach(). The comment before the put_device() call says that the phydev might go away with put_device(). Fix potential use-after-free by calling phy_device_reset() before put_device(). Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220119162748.32418-1-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 74d8e1dc125f..ce0bb5951b81 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1746,6 +1746,9 @@ void phy_detach(struct phy_device *phydev) phy_driver_is_genphy_10g(phydev)) device_release_driver(&phydev->mdio.dev); + /* Assert the reset signal */ + phy_device_reset(phydev, 1); + /* * The phydev might go away on the put_device() below, so avoid * a use-after-free bug by reading the underlying bus first. @@ -1757,9 +1760,6 @@ void phy_detach(struct phy_device *phydev) ndev_owner = dev->dev.parent->driver->owner; if (ndev_owner != bus->owner) module_put(bus->owner); - - /* Assert the reset signal */ - phy_device_reset(phydev, 1); } EXPORT_SYMBOL(phy_detach); From 48cec899e357cfb92d022a9c0df6bbe72a7f6951 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 20 Jan 2022 14:34:40 +0200 Subject: [PATCH 0303/1295] tcp: Add a stub for sk_defer_free_flush() When compiling the kernel with CONFIG_INET disabled, the sk_defer_free_flush() should be defined as a nop. This resolves the following compilation error: ld: net/core/sock.o: in function `sk_defer_free_flush': ./include/net/tcp.h:1378: undefined reference to `__sk_defer_free_flush' Fixes: 79074a72d335 ("net: Flush deferred skb free on socket destroy") Reported-by: kernel test robot Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220120123440.9088-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 44e442bf23f9..b9fc978fb2ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1369,6 +1369,7 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +#ifdef CONFIG_INET void __sk_defer_free_flush(struct sock *sk); static inline void sk_defer_free_flush(struct sock *sk) @@ -1377,6 +1378,9 @@ static inline void sk_defer_free_flush(struct sock *sk) return; __sk_defer_free_flush(sk); } +#else +static inline void sk_defer_free_flush(struct sock *sk) {} +#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); From ebdc1a0309629e71e5910b353e6b005f022ce171 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jan 2022 04:45:30 -0800 Subject: [PATCH 0304/1295] tcp: add a missing sk_defer_free_flush() in tcp_splice_read() Without it, splice users can hit the warning added in commit 79074a72d335 ("net: Flush deferred skb free on socket destroy") Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released") Fixes: 79074a72d335 ("net: Flush deferred skb free on socket destroy") Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Cc: Gal Pressman Link: https://lore.kernel.org/r/20220120124530.925607-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3b75836db19b..78e81465f5f3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -842,6 +842,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } release_sock(sk); + sk_defer_free_flush(sk); if (spliced) return spliced; From aafc2e3285c2d7a79b7ee15221c19fbeca7b1509 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jan 2022 09:41:12 -0800 Subject: [PATCH 0305/1295] ipv6: annotate accesses to fn->fn_sernum struct fib6_node's fn_sernum field can be read while other threads change it. Add READ_ONCE()/WRITE_ONCE() annotations. Do not change existing smp barriers in fib6_get_cookie_safe() and __fib6_update_sernum_upto_root() syzbot reported: BUG: KCSAN: data-race in fib6_clean_node / inet6_csk_route_socket write to 0xffff88813df62e2c of 4 bytes by task 1920 on cpu 1: fib6_clean_node+0xc2/0x260 net/ipv6/ip6_fib.c:2178 fib6_walk_continue+0x38e/0x430 net/ipv6/ip6_fib.c:2112 fib6_walk net/ipv6/ip6_fib.c:2160 [inline] fib6_clean_tree net/ipv6/ip6_fib.c:2240 [inline] __fib6_clean_all+0x1a9/0x2e0 net/ipv6/ip6_fib.c:2256 fib6_flush_trees+0x6c/0x80 net/ipv6/ip6_fib.c:2281 rt_genid_bump_ipv6 include/net/net_namespace.h:488 [inline] addrconf_dad_completed+0x57f/0x870 net/ipv6/addrconf.c:4230 addrconf_dad_work+0x908/0x1170 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:359 ret_from_fork+0x1f/0x30 read to 0xffff88813df62e2c of 4 bytes by task 15701 on cpu 0: fib6_get_cookie_safe include/net/ip6_fib.h:285 [inline] rt6_get_cookie include/net/ip6_fib.h:306 [inline] ip6_dst_store include/net/ip6_route.h:234 [inline] inet6_csk_route_socket+0x352/0x3c0 net/ipv6/inet6_connection_sock.c:109 inet6_csk_xmit+0x91/0x1e0 net/ipv6/inet6_connection_sock.c:121 __tcp_transmit_skb+0x1323/0x1840 net/ipv4/tcp_output.c:1402 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 tcp_push+0x2d9/0x2f0 net/ipv4/tcp.c:725 mptcp_push_release net/mptcp/protocol.c:1491 [inline] __mptcp_push_pending+0x46c/0x490 net/mptcp/protocol.c:1578 mptcp_sendmsg+0x9ec/0xa50 net/mptcp/protocol.c:1764 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:643 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] kernel_sendmsg+0x97/0xd0 net/socket.c:745 sock_no_sendpage+0x84/0xb0 net/core/sock.c:3086 inet_sendpage+0x9d/0xc0 net/ipv4/af_inet.c:834 kernel_sendpage+0x187/0x200 net/socket.c:3492 sock_sendpage+0x5a/0x70 net/socket.c:1007 pipe_to_sendpage+0x128/0x160 fs/splice.c:364 splice_from_pipe_feed fs/splice.c:418 [inline] __splice_from_pipe+0x207/0x500 fs/splice.c:562 splice_from_pipe fs/splice.c:597 [inline] generic_splice_sendpage+0x94/0xd0 fs/splice.c:746 do_splice_from fs/splice.c:767 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:936 splice_direct_to_actor+0x345/0x650 fs/splice.c:891 do_splice_direct+0x106/0x190 fs/splice.c:979 do_sendfile+0x675/0xc40 fs/read_write.c:1245 __do_sys_sendfile64 fs/read_write.c:1310 [inline] __se_sys_sendfile64 fs/read_write.c:1296 [inline] __x64_sys_sendfile64+0x102/0x140 fs/read_write.c:1296 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000026f -> 0x00000271 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15701 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 The Fixes tag I chose is probably arbitrary, I do not think we need to backport this patch to older kernels. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220120174112.1126644-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 23 +++++++++++++---------- net/ipv6/route.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a9a4ccc0cdb5..40ae8f1b18e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -282,7 +282,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; + *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 463c37dea449..413f66781e50 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -112,7 +112,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i) fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) - fn->fn_sernum = fib6_new_sernum(net); + WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* @@ -590,12 +590,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; - cb->args[5] = w->root->fn_sernum; + cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { - if (cb->args[5] != w->root->fn_sernum) { + int sernum = READ_ONCE(w->root->fn_sernum); + if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ - cb->args[5] = w->root->fn_sernum; + cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; @@ -1345,7 +1346,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { - fn->fn_sernum = sernum; + WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } @@ -2174,8 +2175,8 @@ static int fib6_clean_node(struct fib6_walker *w) }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && - w->node->fn_sernum != c->sernum) - w->node->fn_sernum = c->sernum; + READ_ONCE(w->node->fn_sernum) != c->sernum) + WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); @@ -2543,7 +2544,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; - iter->sernum = iter->w.root->fn_sernum; + iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } @@ -2571,8 +2572,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { - if (iter->sernum != iter->w.root->fn_sernum) { - iter->sernum = iter->w.root->fn_sernum; + int sernum = READ_ONCE(iter->w.root->fn_sernum); + + if (iter->sernum != sernum) { + iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6de94203c13..f4884cda13b9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2802,7 +2802,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); From 8e9eacad7ec7a9cbf262649ebf1fa6e6f6cc7d82 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Jan 2022 16:35:27 -0800 Subject: [PATCH 0306/1295] mptcp: fix msk traversal in mptcp_nl_cmd_set_flags() The MPTCP endpoint list is under RCU protection, guarded by the pernet spinlock. mptcp_nl_cmd_set_flags() traverses the list without acquiring the spin-lock nor under the RCU critical section. This change addresses the issue performing the lookup and the endpoint update under the pernet spinlock. Fixes: 0f9f696a502e ("mptcp: add set_flags command in PM netlink") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 75af1f701e1d..f17a09f7fbf9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -478,6 +478,20 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) return NULL; } +static struct mptcp_pm_addr_entry * +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, + bool lookup_by_id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if ((!lookup_by_id && addresses_equal(&entry->addr, info, true)) || + (lookup_by_id && entry->addr.id == info->id)) + return entry; + } + return NULL; +} + static int lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr) { @@ -1763,18 +1777,21 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } - list_for_each_entry(entry, &pernet->local_addr_list, list) { - if ((!lookup_by_id && addresses_equal(&entry->addr, &addr.addr, true)) || - (lookup_by_id && entry->addr.id == addr.addr.id)) { - mptcp_nl_addr_backup(net, &entry->addr, bkup); - - if (bkup) - entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; - else - entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; - } + spin_lock_bh(&pernet->lock); + entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); + if (!entry) { + spin_unlock_bh(&pernet->lock); + return -EINVAL; } + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + addr = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_nl_addr_backup(net, &addr.addr, bkup); return 0; } From a4c0214fbee97c46e3f41fee37931d66c0fc3cb1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 20 Jan 2022 16:35:28 -0800 Subject: [PATCH 0307/1295] mptcp: fix removing ids bitmap setting In mptcp_pm_nl_rm_addr_or_subflow(), the bit of rm_list->ids[i] in the id_avail_bitmap should be set, not rm_list->ids[1]. This patch fixed it. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f17a09f7fbf9..782b1d452269 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -791,7 +791,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, removed = true; __MPTCP_INC_STATS(sock_net(sk), rm_type); } - __set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap); + __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap); if (!removed) continue; From 9846921dba4936d92f7608315b5d1e0a8ec3a538 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Jan 2022 16:35:29 -0800 Subject: [PATCH 0308/1295] selftests: mptcp: fix ipv6 routing setup MPJ ipv6 selftests currently lack per link route to the server net. Additionally, ipv6 subflows endpoints are created without any interface specified. The end-result is that in ipv6 self-tests subflows are created all on the same link, leading to expected delays and sporadic self-tests failures. Fix the issue by adding the missing setup bits. Fixes: 523514ed0a99 ("selftests: mptcp: add ADD_ADDR IPv6 test cases") Reported-and-tested-by: Geliang Tang Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 27d0eb9afdca..b8bdbec0cf69 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -75,6 +75,7 @@ init() # let $ns2 reach any $ns1 address from any interface ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + ip -net "$ns2" route add default via dead:beef:$i::1 dev ns2eth$i metric 10$i done } @@ -1476,7 +1477,7 @@ ipv6_tests() reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 ip netns exec $ns2 ./pm_nl_ctl limits 0 1 - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow chk_join_nr "single subflow IPv6" 1 1 1 @@ -1511,7 +1512,7 @@ ipv6_tests() ip netns exec $ns1 ./pm_nl_ctl limits 0 2 ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 2 - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow run_tests $ns1 $ns2 dead:beef:1::1 0 -1 -1 slow chk_join_nr "remove subflow and signal IPv6" 2 2 2 chk_add_nr 1 1 From b875b39e7373dcaccb19a600a52a956061c2c833 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jan 2022 17:19:33 +0900 Subject: [PATCH 0309/1295] ata: pata_octeon_cf: fix call to trace_ata_bmdma_stop() The first argument of trace_ata_bmdma_stop() must be a pointer to a struct ata_port, not to a struct ata_queued_cmd. Reported-by: Linux Kernel Functional Testing Fixes: d3e140f2b008 ("ata: pata_octeon_cf: Drop pointless VPRINTK() calls and convert the remaining one") Signed-off-by: Damien Le Moal Tested-by: Linux Kernel Functional Testing --- drivers/ata/pata_octeon_cf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index 0912846bc1b0..05c2ab375756 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -595,7 +595,7 @@ static unsigned int octeon_cf_dma_finished(struct ata_port *ap, union cvmx_mio_boot_dma_intx dma_int; u8 status; - trace_ata_bmdma_stop(qc, &qc->tf, qc->tag); + trace_ata_bmdma_stop(ap, &qc->tf, qc->tag); if (ap->hsm_task_state != HSM_ST_LAST) return 0; From d225c449ab2be25273a3674f476c6c0b57c50254 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Fri, 21 Jan 2022 12:04:39 +0530 Subject: [PATCH 0310/1295] octeontx2-af: Do not fixup all VF action entries AF modifies all the rules destined for VF to use the action same as default RSS action. This fixup was needed because AF only installs default rules with RSS action. But the action in rules installed by a PF for its VFs should not be changed by this fixup. This is because action can be drop or direct to queue as specified by user(ntuple filters). This patch fixes that problem. Fixes: 967db3529eca ("octeontx2-af: add support for multicast/promisc packet") Signed-off-by: Subbaraya Sundeep Signed-off-by: Naveen Mamindlapalli Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 22 ++++++++++++++++--- .../marvell/octeontx2/af/rvu_npc_fs.c | 20 ++++++++++------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index c0005a1feee6..91f86d77cd41 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -402,6 +402,7 @@ static void npc_fixup_vf_rule(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr, int index, struct mcam_entry *entry, bool *enable) { + struct rvu_npc_mcam_rule *rule; u16 owner, target_func; struct rvu_pfvf *pfvf; u64 rx_action; @@ -423,6 +424,12 @@ static void npc_fixup_vf_rule(struct rvu *rvu, struct npc_mcam *mcam, test_bit(NIXLF_INITIALIZED, &pfvf->flags))) *enable = false; + /* fix up not needed for the rules added by user(ntuple filters) */ + list_for_each_entry(rule, &mcam->mcam_rules, list) { + if (rule->entry == index) + return; + } + /* copy VF default entry action to the VF mcam entry */ rx_action = npc_get_default_entry_action(rvu, mcam, blkaddr, target_func); @@ -489,8 +496,8 @@ static void npc_config_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, } /* PF installing VF rule */ - if (intf == NIX_INTF_RX && actindex < mcam->bmap_entries) - npc_fixup_vf_rule(rvu, mcam, blkaddr, index, entry, &enable); + if (is_npc_intf_rx(intf) && actindex < mcam->bmap_entries) + npc_fixup_vf_rule(rvu, mcam, blkaddr, actindex, entry, &enable); /* Set 'action' */ rvu_write64(rvu, blkaddr, @@ -916,7 +923,8 @@ static void npc_update_vf_flow_entry(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr, u16 pcifunc, u64 rx_action) { int actindex, index, bank, entry; - bool enable; + struct rvu_npc_mcam_rule *rule; + bool enable, update; if (!(pcifunc & RVU_PFVF_FUNC_MASK)) return; @@ -924,6 +932,14 @@ static void npc_update_vf_flow_entry(struct rvu *rvu, struct npc_mcam *mcam, mutex_lock(&mcam->lock); for (index = 0; index < mcam->bmap_entries; index++) { if (mcam->entry2target_pffunc[index] == pcifunc) { + update = true; + /* update not needed for the rules added via ntuple filters */ + list_for_each_entry(rule, &mcam->mcam_rules, list) { + if (rule->entry == index) + update = false; + } + if (!update) + continue; bank = npc_get_bank(mcam, index); actindex = index; entry = index & (mcam->banksize - 1); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index ff2b21999f36..19c53e591d0d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -1098,14 +1098,6 @@ find_rule: write_req.cntr = rule->cntr; } - err = rvu_mbox_handler_npc_mcam_write_entry(rvu, &write_req, - &write_rsp); - if (err) { - rvu_mcam_remove_counter_from_rule(rvu, owner, rule); - if (new) - kfree(rule); - return err; - } /* update rule */ memcpy(&rule->packet, &dummy.packet, sizeof(rule->packet)); memcpy(&rule->mask, &dummy.mask, sizeof(rule->mask)); @@ -1132,6 +1124,18 @@ find_rule: if (req->default_rule) pfvf->def_ucast_rule = rule; + /* write to mcam entry registers */ + err = rvu_mbox_handler_npc_mcam_write_entry(rvu, &write_req, + &write_rsp); + if (err) { + rvu_mcam_remove_counter_from_rule(rvu, owner, rule); + if (new) { + list_del(&rule->list); + kfree(rule); + } + return err; + } + /* VF's MAC address is being changed via PF */ if (pf_set_vfs_mac) { ether_addr_copy(pfvf->default_mac, req->packet.dmac); From 00bfe94e388fe12bfd0d4f6361b1b1343374ff5b Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Fri, 21 Jan 2022 12:04:40 +0530 Subject: [PATCH 0311/1295] octeontx2-af: Fix LBK backpressure id count In rvu_nix_get_bpid() lbk_bpid_cnt is being read from wrong register. Due to this backpressure enable is failing for LBK VF32 onwards. This patch fixes that. Fixes: fe1939bb2340 ("octeontx2-af: Add SDP interface support") Signed-off-by: Sunil Goutham Signed-off-by: Subbaraya Sundeep Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index d8b1948aaa0a..b74ab0dae10d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -512,11 +512,11 @@ static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST); lmac_chan_cnt = cfg & 0xFF; - cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST1); - sdp_chan_cnt = cfg & 0xFFF; - cgx_bpid_cnt = hw->cgx_links * lmac_chan_cnt; lbk_bpid_cnt = hw->lbk_links * ((cfg >> 16) & 0xFF); + + cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST1); + sdp_chan_cnt = cfg & 0xFFF; sdp_bpid_cnt = hw->sdp_links * sdp_chan_cnt; pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc); From 03ffbc9914bd1130fba464f0a41c01372e5fc359 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Fri, 21 Jan 2022 12:04:41 +0530 Subject: [PATCH 0312/1295] octeontx2-af: Retry until RVU block reset complete Few RVU blocks like SSO require more time for reset on some silicons. Hence retrying the block reset until success. Fixes: c0fa2cff8822c ("octeontx2-af: Handle return value in block reset") Signed-off-by: Geetha sowjanya Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 3ca6b942ebe2..54e1b27a7dfe 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -520,8 +520,11 @@ static void rvu_block_reset(struct rvu *rvu, int blkaddr, u64 rst_reg) rvu_write64(rvu, blkaddr, rst_reg, BIT_ULL(0)); err = rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true); - if (err) - dev_err(rvu->dev, "HW block:%d reset failed\n", blkaddr); + if (err) { + dev_err(rvu->dev, "HW block:%d reset timeout retrying again\n", blkaddr); + while (rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true) == -EBUSY) + ; + } } static void rvu_reset_all_blocks(struct rvu *rvu) From fae80edeafbbba5ef9a0423aa5e5515518626433 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Fri, 21 Jan 2022 12:04:42 +0530 Subject: [PATCH 0313/1295] octeontx2-af: cn10k: Use appropriate register for LMAC enable CN10K platforms uses RPM(0..2)_MTI_MAC100(0..3)_COMMAND_CONFIG register for lmac TX/RX enable whereas CN9xxx platforms use CGX_CMRX_CONFIG register. This config change was missed when adding support for CN10K RPM. Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Geetha sowjanya Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/af/cgx.c | 2 + .../marvell/octeontx2/af/lmac_common.h | 3 ++ .../net/ethernet/marvell/octeontx2/af/rpm.c | 39 +++++++++++++++++++ .../net/ethernet/marvell/octeontx2/af/rpm.h | 4 ++ .../net/ethernet/marvell/octeontx2/af/rvu.h | 1 + .../ethernet/marvell/octeontx2/af/rvu_cgx.c | 14 ++++++- .../ethernet/marvell/octeontx2/af/rvu_nix.c | 10 ++--- 7 files changed, 66 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 186d00a9ab35..3631d612aaca 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -1570,6 +1570,8 @@ static struct mac_ops cgx_mac_ops = { .mac_enadis_pause_frm = cgx_lmac_enadis_pause_frm, .mac_pause_frm_config = cgx_lmac_pause_frm_config, .mac_enadis_ptp_config = cgx_lmac_ptp_config, + .mac_rx_tx_enable = cgx_lmac_rx_tx_enable, + .mac_tx_enable = cgx_lmac_tx_enable, }; static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h index fc6e7423cbd8..b33e7d1d0851 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h @@ -107,6 +107,9 @@ struct mac_ops { void (*mac_enadis_ptp_config)(void *cgxd, int lmac_id, bool enable); + + int (*mac_rx_tx_enable)(void *cgxd, int lmac_id, bool enable); + int (*mac_tx_enable)(void *cgxd, int lmac_id, bool enable); }; struct cgx { diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c index e695fa0e82a9..4cbd91540f99 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c @@ -30,6 +30,8 @@ static struct mac_ops rpm_mac_ops = { .mac_enadis_pause_frm = rpm_lmac_enadis_pause_frm, .mac_pause_frm_config = rpm_lmac_pause_frm_config, .mac_enadis_ptp_config = rpm_lmac_ptp_config, + .mac_rx_tx_enable = rpm_lmac_rx_tx_enable, + .mac_tx_enable = rpm_lmac_tx_enable, }; struct mac_ops *rpm_get_mac_ops(void) @@ -54,6 +56,43 @@ int rpm_get_nr_lmacs(void *rpmd) return hweight8(rpm_read(rpm, 0, CGXX_CMRX_RX_LMACS) & 0xFULL); } +int rpm_lmac_tx_enable(void *rpmd, int lmac_id, bool enable) +{ + rpm_t *rpm = rpmd; + u64 cfg, last; + + if (!is_lmac_valid(rpm, lmac_id)) + return -ENODEV; + + cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG); + last = cfg; + if (enable) + cfg |= RPM_TX_EN; + else + cfg &= ~(RPM_TX_EN); + + if (cfg != last) + rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg); + return !!(last & RPM_TX_EN); +} + +int rpm_lmac_rx_tx_enable(void *rpmd, int lmac_id, bool enable) +{ + rpm_t *rpm = rpmd; + u64 cfg; + + if (!is_lmac_valid(rpm, lmac_id)) + return -ENODEV; + + cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG); + if (enable) + cfg |= RPM_RX_EN | RPM_TX_EN; + else + cfg &= ~(RPM_RX_EN | RPM_TX_EN); + rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg); + return 0; +} + void rpm_lmac_enadis_rx_pause_fwding(void *rpmd, int lmac_id, bool enable) { rpm_t *rpm = rpmd; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.h b/drivers/net/ethernet/marvell/octeontx2/af/rpm.h index 57c8a687b488..ff580311edd0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.h @@ -43,6 +43,8 @@ #define RPMX_MTI_STAT_DATA_HI_CDC 0x10038 #define RPM_LMAC_FWI 0xa +#define RPM_TX_EN BIT_ULL(0) +#define RPM_RX_EN BIT_ULL(1) /* Function Declarations */ int rpm_get_nr_lmacs(void *rpmd); @@ -57,4 +59,6 @@ int rpm_lmac_enadis_pause_frm(void *rpmd, int lmac_id, u8 tx_pause, int rpm_get_tx_stats(void *rpmd, int lmac_id, int idx, u64 *tx_stat); int rpm_get_rx_stats(void *rpmd, int lmac_id, int idx, u64 *rx_stat); void rpm_lmac_ptp_config(void *rpmd, int lmac_id, bool enable); +int rpm_lmac_rx_tx_enable(void *rpmd, int lmac_id, bool enable); +int rpm_lmac_tx_enable(void *rpmd, int lmac_id, bool enable); #endif /* RPM_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 66e45d733824..5ed94cfb47d2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -806,6 +806,7 @@ bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature); u32 rvu_cgx_get_fifolen(struct rvu *rvu); void *rvu_first_cgx_pdata(struct rvu *rvu); int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id); +int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable); int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf, int type); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 2ca182a4ce82..8a7ac5a8b821 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -441,16 +441,26 @@ void rvu_cgx_enadis_rx_bp(struct rvu *rvu, int pf, bool enable) int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start) { int pf = rvu_get_pf(pcifunc); + struct mac_ops *mac_ops; u8 cgx_id, lmac_id; + void *cgxd; if (!is_cgx_config_permitted(rvu, pcifunc)) return LMAC_AF_ERR_PERM_DENIED; rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); + cgxd = rvu_cgx_pdata(cgx_id, rvu); + mac_ops = get_mac_ops(cgxd); - cgx_lmac_rx_tx_enable(rvu_cgx_pdata(cgx_id, rvu), lmac_id, start); + return mac_ops->mac_rx_tx_enable(cgxd, lmac_id, start); +} - return 0; +int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable) +{ + struct mac_ops *mac_ops; + + mac_ops = get_mac_ops(cgxd); + return mac_ops->mac_tx_enable(cgxd, lmac_id, enable); } void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index b74ab0dae10d..de6e5a128864 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2068,8 +2068,8 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, /* enable cgx tx if disabled */ if (is_pf_cgxmapped(rvu, pf)) { rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); - restore_tx_en = !cgx_lmac_tx_enable(rvu_cgx_pdata(cgx_id, rvu), - lmac_id, true); + restore_tx_en = !rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), + lmac_id, true); } cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq)); @@ -2092,7 +2092,7 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, rvu_cgx_enadis_rx_bp(rvu, pf, true); /* restore cgx tx state */ if (restore_tx_en) - cgx_lmac_tx_enable(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); + rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); return err; } @@ -3878,7 +3878,7 @@ nix_config_link_credits(struct rvu *rvu, int blkaddr, int link, /* Enable cgx tx if disabled for credits to be back */ if (is_pf_cgxmapped(rvu, pf)) { rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); - restore_tx_en = !cgx_lmac_tx_enable(rvu_cgx_pdata(cgx_id, rvu), + restore_tx_en = !rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, true); } @@ -3918,7 +3918,7 @@ exit: /* Restore state of cgx tx */ if (restore_tx_en) - cgx_lmac_tx_enable(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); + rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); mutex_unlock(&rvu->rsrc_lock); return rc; From c5d731c54a17677939bd59ee8be4ed74d7485ba4 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Fri, 21 Jan 2022 12:04:43 +0530 Subject: [PATCH 0314/1295] octeontx2-pf: cn10k: Ensure valid pointers are freed to aura While freeing SQB pointers to aura, driver first memcpy to target address and then triggers lmtst operation to free pointer to the aura. We need to ensure(by adding dmb barrier)that memcpy is finished before pointers are freed to the aura. This patch also adds the missing sq context structure entry in debugfs. Fixes: ef6c8da71eaf ("octeontx2-pf: cn10K: Reserve LMTST lines per core") Signed-off-by: Geetha sowjanya Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 2 ++ drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index a09a507369ac..d1eddb769a41 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -1224,6 +1224,8 @@ static void print_nix_cn10k_sq_ctx(struct seq_file *m, seq_printf(m, "W3: head_offset\t\t\t%d\nW3: smenq_next_sqb_vld\t\t%d\n\n", sq_ctx->head_offset, sq_ctx->smenq_next_sqb_vld); + seq_printf(m, "W3: smq_next_sq_vld\t\t%d\nW3: smq_pend\t\t\t%d\n", + sq_ctx->smq_next_sq_vld, sq_ctx->smq_pend); seq_printf(m, "W4: next_sqb \t\t\t%llx\n\n", sq_ctx->next_sqb); seq_printf(m, "W5: tail_sqb \t\t\t%llx\n\n", sq_ctx->tail_sqb); seq_printf(m, "W6: smenq_sqb \t\t\t%llx\n\n", sq_ctx->smenq_sqb); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 61e52812983f..14509fc64cce 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -603,6 +603,7 @@ static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, size++; tar_addr |= ((size - 1) & 0x7) << 4; } + dma_wmb(); memcpy((u64 *)lmt_info->lmt_addr, ptrs, sizeof(u64) * num_ptrs); /* Perform LMTST flush */ cn10k_lmt_flush(val, tar_addr); From 1581d61b42d985cefe7b71eea67ab3bfcbf34d0f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Fri, 21 Jan 2022 12:04:44 +0530 Subject: [PATCH 0315/1295] octeontx2-af: Increase link credit restore polling timeout It's been observed that sometimes link credit restore takes a lot of time than the current timeout. This patch increases the default timeout value and return the proper error value on failure. Fixes: 1c74b89171c3 ("octeontx2-af: Wait for TX link idle for credits change") Signed-off-by: Geetha sowjanya Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 1 + drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 4e79e918a161..58e2aeebc14f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -732,6 +732,7 @@ enum nix_af_status { NIX_AF_ERR_BANDPROF_INVAL_REQ = -428, NIX_AF_ERR_CQ_CTX_WRITE_ERR = -429, NIX_AF_ERR_AQ_CTX_RETRY_WRITE = -430, + NIX_AF_ERR_LINK_CREDITS = -431, }; /* For NIX RX vtag action */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index de6e5a128864..97fb61915379 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -3891,8 +3891,8 @@ nix_config_link_credits(struct rvu *rvu, int blkaddr, int link, NIX_AF_TL1X_SW_XOFF(schq), BIT_ULL(0)); } - rc = -EBUSY; - poll_tmo = jiffies + usecs_to_jiffies(10000); + rc = NIX_AF_ERR_LINK_CREDITS; + poll_tmo = jiffies + usecs_to_jiffies(200000); /* Wait for credits to return */ do { if (time_after(jiffies, poll_tmo)) From df66b6ebc5dcf7253e35a640b9ec4add54195c25 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Fri, 21 Jan 2022 12:04:45 +0530 Subject: [PATCH 0316/1295] octeontx2-af: cn10k: Do not enable RPM loopback for LPC interfaces Internal looback is not supported to low rate LPCS interface like SGMII/QSGMII. Hence don't allow to enable for such interfaces. Fixes: 3ad3f8f93c81 ("octeontx2-af: cn10k: MAC internal loopback support") Signed-off-by: Geetha sowjanya Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/af/rpm.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c index 4cbd91540f99..9ea2f6ac38ec 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c @@ -291,23 +291,20 @@ int rpm_lmac_internal_loopback(void *rpmd, int lmac_id, bool enable) if (!rpm || lmac_id >= rpm->lmac_count) return -ENODEV; lmac_type = rpm->mac_ops->get_lmac_type(rpm, lmac_id); - if (lmac_type == LMAC_MODE_100G_R) { - cfg = rpm_read(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1); - if (enable) - cfg |= RPMX_MTI_PCS_LBK; - else - cfg &= ~RPMX_MTI_PCS_LBK; - rpm_write(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1, cfg); - } else { - cfg = rpm_read(rpm, lmac_id, RPMX_MTI_LPCSX_CONTROL1); - if (enable) - cfg |= RPMX_MTI_PCS_LBK; - else - cfg &= ~RPMX_MTI_PCS_LBK; - rpm_write(rpm, lmac_id, RPMX_MTI_LPCSX_CONTROL1, cfg); + if (lmac_type == LMAC_MODE_QSGMII || lmac_type == LMAC_MODE_SGMII) { + dev_err(&rpm->pdev->dev, "loopback not supported for LPC mode\n"); + return 0; } + cfg = rpm_read(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1); + + if (enable) + cfg |= RPMX_MTI_PCS_LBK; + else + cfg &= ~RPMX_MTI_PCS_LBK; + rpm_write(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1, cfg); + return 0; } From a8db854be28622a2477cb21cdf7f829adbb2c42d Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Fri, 21 Jan 2022 12:04:46 +0530 Subject: [PATCH 0317/1295] octeontx2-pf: Forward error codes to VF PF forwards its VF messages to AF and corresponding replies from AF to VF. AF sets proper error code in the replies after processing message requests. Currently PF checks the error codes in replies and sends invalid message to VF. This way VF lacks the information of error code set by AF for its messages. This patch changes that such that PF simply forwards AF replies so that VF can handle error codes. Fixes: d424b6c02415 ("octeontx2-pf: Enable SRIOV and added VF mbox handling") Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 6080ebd9bd94..d39341e4ab37 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -394,7 +394,12 @@ static int otx2_forward_vf_mbox_msgs(struct otx2_nic *pf, dst_mdev->msg_size = mbox_hdr->msg_size; dst_mdev->num_msgs = num_msgs; err = otx2_sync_mbox_msg(dst_mbox); - if (err) { + /* Error code -EIO indicate there is a communication failure + * to the AF. Rest of the error codes indicate that AF processed + * VF messages and set the error codes in response messages + * (if any) so simply forward responses to VF. + */ + if (err == -EIO) { dev_warn(pf->dev, "AF not responding to VF%d messages\n", vf); /* restore PF mbase and exit */ From 745166fcf01cecc4f5ff3defc6586868349a43f9 Mon Sep 17 00:00:00 2001 From: Kiran Kumar K Date: Fri, 21 Jan 2022 12:04:47 +0530 Subject: [PATCH 0318/1295] octeontx2-af: Add KPU changes to parse NGIO as separate layer With current KPU profile NGIO is being parsed along with CTAG as a single layer. Because of this MCAM/ntuple rules installed with ethertype as 0x8842 are not being hit. Adding KPU profile changes to parse NGIO in separate ltype and CTAG in separate ltype. Fixes: f9c49be90c05 ("octeontx2-af: Update the default KPU profile and fixes") Signed-off-by: Kiran Kumar K Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../marvell/octeontx2/af/npc_profile.h | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h index 0fe7ad35e36f..4180376fa676 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h @@ -185,7 +185,6 @@ enum npc_kpu_parser_state { NPC_S_KPU2_QINQ, NPC_S_KPU2_ETAG, NPC_S_KPU2_EXDSA, - NPC_S_KPU2_NGIO, NPC_S_KPU2_CPT_CTAG, NPC_S_KPU2_CPT_QINQ, NPC_S_KPU3_CTAG, @@ -212,6 +211,7 @@ enum npc_kpu_parser_state { NPC_S_KPU5_NSH, NPC_S_KPU5_CPT_IP, NPC_S_KPU5_CPT_IP6, + NPC_S_KPU5_NGIO, NPC_S_KPU6_IP6_EXT, NPC_S_KPU6_IP6_HOP_DEST, NPC_S_KPU6_IP6_ROUT, @@ -1120,15 +1120,6 @@ static struct npc_kpu_profile_cam kpu1_cam_entries[] = { 0x0000, 0x0000, }, - { - NPC_S_KPU1_ETHER, 0xff, - NPC_ETYPE_CTAG, - 0xffff, - NPC_ETYPE_NGIO, - 0xffff, - 0x0000, - 0x0000, - }, { NPC_S_KPU1_ETHER, 0xff, NPC_ETYPE_CTAG, @@ -1966,6 +1957,15 @@ static struct npc_kpu_profile_cam kpu2_cam_entries[] = { 0x0000, 0x0000, }, + { + NPC_S_KPU2_CTAG, 0xff, + NPC_ETYPE_NGIO, + 0xffff, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + }, { NPC_S_KPU2_CTAG, 0xff, NPC_ETYPE_PPPOE, @@ -2749,15 +2749,6 @@ static struct npc_kpu_profile_cam kpu2_cam_entries[] = { 0x0000, 0x0000, }, - { - NPC_S_KPU2_NGIO, 0xff, - 0x0000, - 0x0000, - 0x0000, - 0x0000, - 0x0000, - 0x0000, - }, { NPC_S_KPU2_CPT_CTAG, 0xff, NPC_ETYPE_IP, @@ -5089,6 +5080,15 @@ static struct npc_kpu_profile_cam kpu5_cam_entries[] = { 0x0000, 0x0000, }, + { + NPC_S_KPU5_NGIO, 0xff, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + }, { NPC_S_NA, 0X00, 0x0000, @@ -8422,14 +8422,6 @@ static struct npc_kpu_profile_action kpu1_action_entries[] = { 0, 0, 0, 0, 0, }, - { - NPC_ERRLEV_RE, NPC_EC_NOERR, - 8, 12, 0, 0, 0, - NPC_S_KPU2_NGIO, 12, 1, - NPC_LID_LA, NPC_LT_LA_ETHER, - 0, - 0, 0, 0, 0, - }, { NPC_ERRLEV_RE, NPC_EC_NOERR, 8, 12, 0, 0, 0, @@ -9194,6 +9186,14 @@ static struct npc_kpu_profile_action kpu2_action_entries[] = { 0, 0, 0, 0, 0, }, + { + NPC_ERRLEV_RE, NPC_EC_NOERR, + 0, 0, 0, 2, 0, + NPC_S_KPU5_NGIO, 6, 1, + NPC_LID_LB, NPC_LT_LB_CTAG, + 0, + 0, 0, 0, 0, + }, { NPC_ERRLEV_RE, NPC_EC_NOERR, 8, 0, 6, 2, 0, @@ -9890,14 +9890,6 @@ static struct npc_kpu_profile_action kpu2_action_entries[] = { NPC_F_LB_U_UNK_ETYPE | NPC_F_LB_L_EXDSA, 0, 0, 0, 0, }, - { - NPC_ERRLEV_RE, NPC_EC_NOERR, - 0, 0, 0, 0, 1, - NPC_S_NA, 0, 1, - NPC_LID_LC, NPC_LT_LC_NGIO, - 0, - 0, 0, 0, 0, - }, { NPC_ERRLEV_RE, NPC_EC_NOERR, 8, 0, 6, 2, 0, @@ -11973,6 +11965,14 @@ static struct npc_kpu_profile_action kpu5_action_entries[] = { 0, 0, 0, 0, 0, }, + { + NPC_ERRLEV_RE, NPC_EC_NOERR, + 0, 0, 0, 0, 1, + NPC_S_NA, 0, 1, + NPC_LID_LC, NPC_LT_LC_NGIO, + 0, + 0, 0, 0, 0, + }, { NPC_ERRLEV_LC, NPC_EC_UNK, 0, 0, 0, 0, 1, From 248be352bbae1a0f14d0d3511a5b0bb9665097f5 Mon Sep 17 00:00:00 2001 From: Ajit Kumar Pandey Date: Thu, 20 Jan 2022 19:06:01 +0530 Subject: [PATCH 0319/1295] ASoC: amd: acp-mach: Fix Left and Right rt1019 amp devices We're setting wrong card codec conf for rt1019 amp devices in our machine driver. Due to this left and right amp channels data are reversed in our machines as wrong device prefix results in wrong value for "Mono LR Select" rt1019 mixer control. Reverse dev ids in codec conf with Left and Right name_prefix to fix such issue. Signed-off-by: Ajit Kumar Pandey Link: https://lore.kernel.org/r/20220120133605.476138-1-AjitKumar.Pandey@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-mach-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/amd/acp/acp-mach-common.c b/sound/soc/amd/acp/acp-mach-common.c index c9caade5cb74..cd05ee2802c9 100644 --- a/sound/soc/amd/acp/acp-mach-common.c +++ b/sound/soc/amd/acp/acp-mach-common.c @@ -303,11 +303,11 @@ static const struct snd_soc_dapm_route rt1019_map_lr[] = { static struct snd_soc_codec_conf rt1019_conf[] = { { - .dlc = COMP_CODEC_CONF("i2c-10EC1019:00"), + .dlc = COMP_CODEC_CONF("i2c-10EC1019:01"), .name_prefix = "Left", }, { - .dlc = COMP_CODEC_CONF("i2c-10EC1019:01"), + .dlc = COMP_CODEC_CONF("i2c-10EC1019:00"), .name_prefix = "Right", }, }; From 80a00ab8344f0fe4d555a1f97960215b659436e9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 13:30:17 +0000 Subject: [PATCH 0320/1295] fscache: Fix the volume collision wait condition The condition that the waits in fscache_wait_on_volume_collision() are waiting until are inverted. This suddenly started happening on the upstream kernel with something like the following appearing in dmesg when running xfstests: CacheFiles: cachefiles: Inode already in use: Iafs,example.com,100055 Fix them by inverting the conditions. Fixes: 62ab63352350 ("fscache: Implement volume registration") Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251398010.3435901.943876048104930939.stgit@warthog.procyon.org.uk/ # v1 --- fs/fscache/volume.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index a57c6cbee858..f2aa7dbad766 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -142,12 +142,12 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { wait_var_event_timeout(&candidate->flags, - fscache_is_acquire_pending(candidate), 20 * HZ); + !fscache_is_acquire_pending(candidate), 20 * HZ); if (!fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, fscache_is_acquire_pending(candidate)); + wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); } } From 5638b067d370583c6c455f019129ce33340b4142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 14:13:59 +0000 Subject: [PATCH 0321/1295] cachefiles: Calculate the blockshift in terms of bytes, not pages Cachefiles keeps track of how much space is available on the backing filesystem and refuses new writes permission to start if there isn't enough (we especially don't want ENOSPC happening). It also tracks the amount of data pending in DIO writes (cache->b_writing) and reduces the amount of free space available by this amount before deciding if it can set up a new write. However, the old fscache I/O API was very much page-granularity dependent and, as such, cachefiles's cache->bshift was meant to be a multiplier to get from PAGE_SIZE to block size (ie. a blocksize of 512 would give a shift of 3 for a 4KiB page) - and this was incorrectly being used to turn the number of bytes in a DIO write into a number of blocks, leading to a massive over estimation of the amount of data in flight. Fix this by changing cache->bshift to be a multiplier from bytes to blocksize and deal with quantities of blocks, not quantities of pages. Fix also the rounding in the calculation in cachefiles_write() which needs a "- 1" inserting. Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251398954.3435901.7138806620218474123.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 7 ++----- fs/cachefiles/internal.h | 2 +- fs/cachefiles/io.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index ce4d4785003c..1e9c71666c6a 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -84,9 +84,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) goto error_unsupported; cache->bsize = stats.f_bsize; - cache->bshift = 0; - if (stats.f_bsize < PAGE_SIZE) - cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + cache->bshift = ilog2(stats.f_bsize); _debug("blksize %u (shift %u)", cache->bsize, cache->bshift); @@ -106,7 +104,6 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) (unsigned long long) cache->fcull, (unsigned long long) cache->fstop); - stats.f_blocks >>= cache->bshift; do_div(stats.f_blocks, 100); cache->bstop = stats.f_blocks * cache->bstop_percent; cache->bcull = stats.f_blocks * cache->bcull_percent; @@ -209,7 +206,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache, return ret; } - b_avail = stats.f_bavail >> cache->bshift; + b_avail = stats.f_bavail; b_writing = atomic_long_read(&cache->b_writing); if (b_avail > b_writing) b_avail -= b_writing; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8dd54d9375b6..c793d33b0224 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -86,7 +86,7 @@ struct cachefiles_cache { unsigned bcull_percent; /* when to start culling (% blocks) */ unsigned bstop_percent; /* when to stop allocating (% blocks) */ unsigned bsize; /* cache's block size */ - unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + unsigned bshift; /* ilog2(bsize) */ uint64_t frun; /* when to stop culling */ uint64_t fcull; /* when to start culling */ uint64_t fstop; /* when to stop allocating */ diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 60b1eac2ce78..04eb52736990 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -264,7 +264,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; - ki->b_writing = (len + (1 << cache->bshift)) >> cache->bshift; + ki->b_writing = (len + (1 << cache->bshift) - 1) >> cache->bshift; if (ki->term_func) ki->iocb.ki_complete = cachefiles_write_complete; From c7ca73155762684a896ba57edf48519b645ea528 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Fri, 14 Jan 2022 15:30:29 +0800 Subject: [PATCH 0322/1295] cachefiles: set default tag name if it's unspecified fscache_acquire_cache() requires a non-empty name, while 'tag ' command is optional for cachefilesd. Thus set default tag name if it's unspecified to avoid the regression of cachefilesd. The logic is the same with that before rewritten. Signed-off-by: Jeffle Xu Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251399914.3435901.4761991152407411408.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/daemon.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 40a792421fc1..7ac04ee2c0a0 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -703,6 +703,17 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) return -EBUSY; } + /* Make sure we have copies of the tag string */ + if (!cache->tag) { + /* + * The tag string is released by the fops->release() + * function, so we don't release it on error here + */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + return cachefiles_add_cache(cache); } From 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 11:44:54 +0000 Subject: [PATCH 0323/1295] cachefiles: Make some tracepoint adjustments Make some adjustments to tracepoints to make the tracing a bit more followable: (1) Standardise on displaying the backing inode number as "B=" with no leading zeros. (2) Make the cachefiles_lookup tracepoint log the directory inode number as well as the looked-up inode number. (3) Add a cachefiles_lookup tracepoint into cachefiles_get_directory() to log directory lookup. (4) Add a new cachefiles_mkdir tracepoint and use that to log a successful mkdir from cachefiles_get_directory(). (5) Make the cachefiles_unlink and cachefiles_rename tracepoints log the inode number of the affected file/dir rather than dentry struct pointers. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251403694.3435901.9797725381831316715.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/namei.c | 8 +-- include/trace/events/cachefiles.h | 82 +++++++++++++++++++------------ 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 9bd692870617..52c9f0864a87 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -101,6 +101,7 @@ retry: subdir = lookup_one_len(dirname, dir, strlen(dirname)); else subdir = ERR_PTR(ret); + trace_cachefiles_lookup(NULL, dir, subdir); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), PTR_ERR(subdir), @@ -135,6 +136,7 @@ retry: cachefiles_trace_mkdir_error); goto mkdir_error; } + trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir))) { cachefiles_put_directory(subdir); @@ -233,7 +235,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, }; int ret; - trace_cachefiles_unlink(object, dentry, why); + trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why); ret = security_path_unlink(&path, dentry); if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); @@ -386,7 +388,7 @@ try_again: .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; - trace_cachefiles_rename(object, rep, grave, why); + trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); ret = cachefiles_inject_read_error(); if (ret == 0) ret = vfs_rename(&rd); @@ -617,7 +619,7 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) object->d_name_len); else dentry = ERR_PTR(ret); - trace_cachefiles_lookup(object, dentry); + trace_cachefiles_lookup(object, fan, dentry); if (IS_ERR(dentry)) { if (dentry == ERR_PTR(-ENOENT)) goto new_file; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 1172529b5b49..093c4acb7a3a 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -233,25 +233,48 @@ TRACE_EVENT(cachefiles_ref, TRACE_EVENT(cachefiles_lookup, TP_PROTO(struct cachefiles_object *obj, + struct dentry *dir, struct dentry *de), - TP_ARGS(obj, de), + TP_ARGS(obj, dir, de), TP_STRUCT__entry( __field(unsigned int, obj ) __field(short, error ) + __field(unsigned long, dino ) __field(unsigned long, ino ) ), TP_fast_assign( - __entry->obj = obj->debug_id; + __entry->obj = obj ? obj->debug_id : 0; + __entry->dino = d_backing_inode(dir)->i_ino; __entry->ino = (!IS_ERR(de) && d_backing_inode(de) ? d_backing_inode(de)->i_ino : 0); __entry->error = IS_ERR(de) ? PTR_ERR(de) : 0; ), - TP_printk("o=%08x i=%lx e=%d", - __entry->obj, __entry->ino, __entry->error) + TP_printk("o=%08x dB=%lx B=%lx e=%d", + __entry->obj, __entry->dino, __entry->ino, __entry->error) + ); + +TRACE_EVENT(cachefiles_mkdir, + TP_PROTO(struct dentry *dir, struct dentry *subdir), + + TP_ARGS(dir, subdir), + + TP_STRUCT__entry( + __field(unsigned int, dir ) + __field(unsigned int, subdir ) + ), + + TP_fast_assign( + __entry->dir = d_backing_inode(dir)->i_ino; + __entry->subdir = d_backing_inode(subdir)->i_ino; + ), + + TP_printk("dB=%x sB=%x", + __entry->dir, + __entry->subdir) ); TRACE_EVENT(cachefiles_tmpfile, @@ -269,7 +292,7 @@ TRACE_EVENT(cachefiles_tmpfile, __entry->backer = backer->i_ino; ), - TP_printk("o=%08x b=%08x", + TP_printk("o=%08x B=%x", __entry->obj, __entry->backer) ); @@ -289,61 +312,58 @@ TRACE_EVENT(cachefiles_link, __entry->backer = backer->i_ino; ), - TP_printk("o=%08x b=%08x", + TP_printk("o=%08x B=%x", __entry->obj, __entry->backer) ); TRACE_EVENT(cachefiles_unlink, TP_PROTO(struct cachefiles_object *obj, - struct dentry *de, + ino_t ino, enum fscache_why_object_killed why), - TP_ARGS(obj, de, why), + TP_ARGS(obj, ino, why), /* Note that obj may be NULL */ TP_STRUCT__entry( __field(unsigned int, obj ) - __field(struct dentry *, de ) + __field(unsigned int, ino ) __field(enum fscache_why_object_killed, why ) ), TP_fast_assign( __entry->obj = obj ? obj->debug_id : UINT_MAX; - __entry->de = de; + __entry->ino = ino; __entry->why = why; ), - TP_printk("o=%08x d=%p w=%s", - __entry->obj, __entry->de, + TP_printk("o=%08x B=%x w=%s", + __entry->obj, __entry->ino, __print_symbolic(__entry->why, cachefiles_obj_kill_traces)) ); TRACE_EVENT(cachefiles_rename, TP_PROTO(struct cachefiles_object *obj, - struct dentry *de, - struct dentry *to, + ino_t ino, enum fscache_why_object_killed why), - TP_ARGS(obj, de, to, why), + TP_ARGS(obj, ino, why), /* Note that obj may be NULL */ TP_STRUCT__entry( __field(unsigned int, obj ) - __field(struct dentry *, de ) - __field(struct dentry *, to ) + __field(unsigned int, ino ) __field(enum fscache_why_object_killed, why ) ), TP_fast_assign( __entry->obj = obj ? obj->debug_id : UINT_MAX; - __entry->de = de; - __entry->to = to; + __entry->ino = ino; __entry->why = why; ), - TP_printk("o=%08x d=%p t=%p w=%s", - __entry->obj, __entry->de, __entry->to, + TP_printk("o=%08x B=%x w=%s", + __entry->obj, __entry->ino, __print_symbolic(__entry->why, cachefiles_obj_kill_traces)) ); @@ -370,7 +390,7 @@ TRACE_EVENT(cachefiles_coherency, __entry->ino = ino; ), - TP_printk("o=%08x %s i=%llx c=%u", + TP_printk("o=%08x %s B=%llx c=%u", __entry->obj, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino, @@ -397,7 +417,7 @@ TRACE_EVENT(cachefiles_vol_coherency, __entry->ino = ino; ), - TP_printk("V=%08x %s i=%llx", + TP_printk("V=%08x %s B=%llx", __entry->vol, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino) @@ -435,7 +455,7 @@ TRACE_EVENT(cachefiles_prep_read, __entry->cache_inode = cache_inode; ), - TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx ni=%x b=%x", + TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx ni=%x B=%x", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->why, cachefiles_prepare_read_traces), @@ -466,7 +486,7 @@ TRACE_EVENT(cachefiles_read, __entry->len = len; ), - TP_printk("o=%08x b=%08x s=%llx l=%zx", + TP_printk("o=%08x B=%x s=%llx l=%zx", __entry->obj, __entry->backer, __entry->start, @@ -495,7 +515,7 @@ TRACE_EVENT(cachefiles_write, __entry->len = len; ), - TP_printk("o=%08x b=%08x s=%llx l=%zx", + TP_printk("o=%08x B=%x s=%llx l=%zx", __entry->obj, __entry->backer, __entry->start, @@ -524,7 +544,7 @@ TRACE_EVENT(cachefiles_trunc, __entry->why = why; ), - TP_printk("o=%08x b=%08x %s l=%llx->%llx", + TP_printk("o=%08x B=%x %s l=%llx->%llx", __entry->obj, __entry->backer, __print_symbolic(__entry->why, cachefiles_trunc_traces), @@ -549,7 +569,7 @@ TRACE_EVENT(cachefiles_mark_active, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x i=%lx", + TP_printk("o=%08x B=%lx", __entry->obj, __entry->inode) ); @@ -570,7 +590,7 @@ TRACE_EVENT(cachefiles_mark_inactive, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x i=%lx", + TP_printk("o=%08x B=%lx", __entry->obj, __entry->inode) ); @@ -594,7 +614,7 @@ TRACE_EVENT(cachefiles_vfs_error, __entry->where = where; ), - TP_printk("o=%08x b=%08x %s e=%d", + TP_printk("o=%08x B=%x %s e=%d", __entry->obj, __entry->backer, __print_symbolic(__entry->where, cachefiles_error_traces), @@ -621,7 +641,7 @@ TRACE_EVENT(cachefiles_io_error, __entry->where = where; ), - TP_printk("o=%08x b=%08x %s e=%d", + TP_printk("o=%08x B=%x %s e=%d", __entry->obj, __entry->backer, __print_symbolic(__entry->where, cachefiles_error_traces), From b64a3314989df8e44c114f377808407f36dbf4f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 11:05:13 +0000 Subject: [PATCH 0324/1295] cachefiles: Trace active-mark failure Add a tracepoint to log failure to apply an active mark to a file in addition to tracing successfully setting and unsetting the mark. Also include the backing file inode number in the message logged to dmesg. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251404666.3435901.17331742792401482190.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/namei.c | 4 +++- include/trace/events/cachefiles.h | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 52c9f0864a87..f256c8aff7bb 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -25,7 +25,9 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, trace_cachefiles_mark_active(object, inode); can_use = true; } else { - pr_notice("cachefiles: Inode already in use: %pd\n", dentry); + trace_cachefiles_mark_failed(object, inode); + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, inode->i_ino); } return can_use; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 093c4acb7a3a..c6f5aa74db89 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -573,6 +573,27 @@ TRACE_EVENT(cachefiles_mark_active, __entry->obj, __entry->inode) ); +TRACE_EVENT(cachefiles_mark_failed, + TP_PROTO(struct cachefiles_object *obj, + struct inode *inode), + + TP_ARGS(obj, inode), + + /* Note that obj may be NULL */ + TP_STRUCT__entry( + __field(unsigned int, obj ) + __field(ino_t, inode ) + ), + + TP_fast_assign( + __entry->obj = obj ? obj->debug_id : 0; + __entry->inode = inode->i_ino; + ), + + TP_printk("o=%08x B=%lx", + __entry->obj, __entry->inode) + ); + TRACE_EVENT(cachefiles_mark_inactive, TP_PROTO(struct cachefiles_object *obj, struct inode *inode), From 14b9d0902dfa25dac9c41bf346aa655fdeafe5b2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 10:51:13 +0000 Subject: [PATCH 0325/1295] cachefiles: Explain checks in a comment Add a comment to explain the checks that cachefiles is making of the backing filesystem[1]. Suggested-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/568749bd7cc02908ecf6f3d6a611b6f9cf5c4afd.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251405621.3435901.771439791811515914.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 1e9c71666c6a..2b2879c5d1d2 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -49,7 +49,13 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) goto error_unsupported; } - /* check parameters */ + /* Check features of the backing filesystem: + * - Directories must support looking up and directory creation + * - We use xattrs to store metadata + * - We need to be able to query the amount of space available + * - We want to be able to sync the filesystem when stopping the cache + * - We use DIO to/from pages, so the blocksize mustn't be too big. + */ ret = -EOPNOTSUPP; if (d_is_negative(root) || !d_backing_inode(root)->i_op->lookup || From 6633213139d827fb9abf9a9a280f3d9e89fc7091 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 10:57:45 +0000 Subject: [PATCH 0326/1295] cachefiles: Check that the backing filesystem supports tmpfiles Add a check that the backing filesystem supports the creation of tmpfiles[1]. Suggested-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/568749bd7cc02908ecf6f3d6a611b6f9cf5c4afd.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251406558.3435901.1249023136670058162.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 2b2879c5d1d2..7077f72e6f47 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -51,6 +51,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) /* Check features of the backing filesystem: * - Directories must support looking up and directory creation + * - We create tmpfiles to handle invalidation * - We use xattrs to store metadata * - We need to be able to query the amount of space available * - We want to be able to sync the filesystem when stopping the cache @@ -60,6 +61,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) if (d_is_negative(root) || !d_backing_inode(root)->i_op->lookup || !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->tmpfile || !(d_backing_inode(root)->i_opflags & IOP_XATTR) || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs || From c522e3ad296b7b692ed3960dfde467f2a34b434f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 09:28:41 +0000 Subject: [PATCH 0327/1295] fscache: Add a comment explaining how page-release optimisation works Add a comment into fscache_note_page_release() to explain how the page-release optimisation logic works[1]. It's not entirely obvious as it has nothing to do with whether or not the netfs file contains data. FSCACHE_COOKIE_NO_DATA_TO_READ is set if we have no data in the cache yet (ie. the backing file lookup was negative, the file is 0 length or the cookie got invalidated). It means that we have no data in the cache, not that the file is necessarily empty on the server. FSCACHE_COOKIE_HAVE_DATA is set once we've stored data in the backing file. From that point on, we have data we *could* read - however, it's covered by pages in the netfs pagecache until at such time one of those covering pages is released. So if we've written data to the cache (HAVE_DATA) and there wasn't any data in the cache when we started (NO_DATA_TO_READ), it may no longer be true that we can skip reading from the cache. Read skipping is done by cachefiles_prepare_read(). Note that tracking is not done on a per-page basis, but only on a per-file basis. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/043a206f03929c2667a465314144e518070a9b2d.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251408479.3435901.9540165422908194636.stgit@warthog.procyon.org.uk/ # v1 --- include/linux/fscache.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ede50406bcb0..296c5f1d9f35 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -665,6 +665,11 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, static inline void fscache_note_page_release(struct fscache_cookie *cookie) { + /* If we've written data to the cache (HAVE_DATA) and there wasn't any + * data in the cache when we started (NO_DATA_TO_READ), it may no + * longer be true that we can skip reading from the cache - so clear + * the flag that causes reads to be skipped. + */ if (cookie && test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) From cef0223191452b3c493a1070baad9ffe806babac Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Tue, 28 Dec 2021 20:44:19 +0800 Subject: [PATCH 0328/1295] netfs: Make ops->init_rreq() optional Make the ops->init_rreq() callback optional. This isn't required for the erofs changes I'm implementing to do on-demand read through fscache[1]. Further, ceph has an empty init_rreq method that can then be removed and it's marked optional in the documentation. Signed-off-by: Jeffle Xu Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/20211227125444.21187-1-jefflexu@linux.alibaba.com/ [1] Link: https://lore.kernel.org/r/20211228124419.103020-1-jefflexu@linux.alibaba.com Link: https://lore.kernel.org/r/164251410387.3435901.2504600788262093313.stgit@warthog.procyon.org.uk/ # v1 --- fs/ceph/addr.c | 5 ----- fs/netfs/read_helper.c | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3d9459c9bbd..c98e5238a1b6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -297,10 +297,6 @@ out: dout("%s: result %d\n", __func__, err); } -static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) -{ -} - static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) { struct inode *inode = mapping->host; @@ -312,7 +308,6 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) } static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, .begin_cache_operation = ceph_begin_cache_operation, .issue_op = ceph_netfs_issue_op, diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 6169659857b3..501da990c259 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -55,7 +55,8 @@ static struct netfs_read_request *netfs_alloc_read_request( INIT_WORK(&rreq->work, netfs_rreq_work); refcount_set(&rreq->usage, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - ops->init_rreq(rreq, file); + if (ops->init_rreq) + ops->init_rreq(rreq, file); netfs_stat(&netfs_n_rh_rreq); } From d24846a4246b6e61ecbd036880a4adf61681d241 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 20 Jan 2022 12:18:12 +0000 Subject: [PATCH 0329/1295] parisc: pdc_stable: Fix memory leak in pdcs_register_pathentries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kobject_init_and_add() takes reference even when it fails. According to the doc of kobject_init_and_add(): If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Fix memory leak by calling kobject_put(). Fixes: 73f368cf679b ("Kobject: change drivers/parisc/pdc_stable.c to use kobject_init_and_add") Signed-off-by: Miaoqian Lin Signed-off-by: Helge Deller --- drivers/parisc/pdc_stable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 9513c39719d1..d9e51036a4fa 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -980,8 +980,10 @@ pdcs_register_pathentries(void) entry->kobj.kset = paths_kset; err = kobject_init_and_add(&entry->kobj, &ktype_pdcspath, NULL, "%s", entry->name); - if (err) + if (err) { + kobject_put(&entry->kobj); return err; + } /* kobject is now registered */ write_lock(&entry->rw_lock); From 2c13c05c5ff4b9fc907b07f7311821910ebaaf8a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jan 2022 23:12:58 +0000 Subject: [PATCH 0330/1295] rxrpc: Adjust retransmission backoff Improve retransmission backoff by only backing off when we retransmit data packets rather than when we set the lost ack timer. To this end: (1) In rxrpc_resend(), use rxrpc_get_rto_backoff() when setting the retransmission timer and only tell it that we are retransmitting if we actually have things to retransmit. Note that it's possible for the retransmission algorithm to race with the processing of a received ACK, so we may see no packets needing retransmission. (2) In rxrpc_send_data_packet(), don't bump the backoff when setting the ack_lost_at timer, as it may then get bumped twice. With this, when looking at one particular packet, the retransmission intervals were seen to be 1.5ms, 2ms, 3ms, 5ms, 9ms, 17ms, 33ms, 71ms, 136ms, 264ms, 544ms, 1.088s, 2.1s, 4.2s and 8.3s. Fixes: c410bf01933e ("rxrpc: Fix the excessive initial retransmission timeout") Suggested-by: Marc Dionne Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/164138117069.2023386.17446904856843997127.stgit@warthog.procyon.org.uk/ Signed-off-by: David S. Miller --- net/rxrpc/call_event.c | 8 +++----- net/rxrpc/output.c | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6be2672a65ea..df864e692267 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -157,7 +157,7 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { struct sk_buff *skb; - unsigned long resend_at, rto_j; + unsigned long resend_at; rxrpc_seq_t cursor, seq, top; ktime_t now, max_age, oldest, ack_ts; int ix; @@ -165,10 +165,8 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); - rto_j = call->peer->rto_j; - now = ktime_get_real(); - max_age = ktime_sub(now, jiffies_to_usecs(rto_j)); + max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); spin_lock_bh(&call->lock); @@ -213,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) } resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); - resend_at += jiffies + rto_j; + resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, retrans); WRITE_ONCE(call->resend_at, resend_at); if (unacked) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 10f2bf2e9068..a45c83f22236 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -468,7 +468,7 @@ done: if (call->peer->rtt_count > 1) { unsigned long nowj = jiffies, ack_lost_at; - ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans); + ack_lost_at = rxrpc_get_rto_backoff(call->peer, false); ack_lost_at += nowj; WRITE_ONCE(call->ack_lost_at, ack_lost_at); rxrpc_reduce_call_timer(call, ack_lost_at, nowj, From 63ec72bd58487935a2e40d2cdffe5c9498f1275e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 20 Jan 2022 23:39:35 -0800 Subject: [PATCH 0331/1295] mptcp: Use struct_group() to avoid cross-field memset() In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() to capture the fields to be reset, so that memset() can be appropriately bounds-checked by the compiler. Cc: Matthieu Baerts Cc: mptcp@lists.linux.dev Signed-off-by: Kees Cook Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20220121073935.1154263-1-keescook@chromium.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0e6b42c76ea0..85317ce38e3f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -408,7 +408,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ - char reset_start[0]; + struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; @@ -458,7 +458,7 @@ struct mptcp_subflow_context { long delegated_status; - char reset_end[0]; + ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ @@ -494,7 +494,7 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { - memset(subflow->reset_start, 0, subflow->reset_end - subflow->reset_start); + memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; } From afa114d987c40e72ebbbc36bedf7d66b7cdc5883 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Fri, 21 Jan 2022 18:34:49 +0100 Subject: [PATCH 0332/1295] selftests: net: ioam: expect support for Queue depth data The IOAM queue-depth data field was added a few weeks ago, but the test unit was not updated accordingly. Reported-by: kernel test robot Fixes: b63c5478e9cb ("ipv6: ioam: Support for Queue depth data field") Signed-off-by: Justin Iurman Link: https://lore.kernel.org/r/20220121173449.26918-1-justin.iurman@uliege.be Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ioam6_parser.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c index 8f6997d35816..d9d1d4190126 100644 --- a/tools/testing/selftests/net/ioam6_parser.c +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -240,11 +240,8 @@ static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h, *p += sizeof(__u32); } - if (ioam6h->type.bit6) { - if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) - return 1; + if (ioam6h->type.bit6) *p += sizeof(__u32); - } if (ioam6h->type.bit7) { if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) From ffa65753c43142f3b803486442813744da71cff2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 21 Jan 2022 22:10:46 -0800 Subject: [PATCH 0333/1295] mm/migrate.c: rework migration_entry_wait() to not take a pageref This fixes the FIXME in migrate_vma_check_page(). Before migrating a page migration code will take a reference and check there are no unexpected page references, failing the migration if there are. When a thread faults on a migration entry it will take a temporary reference to the page to wait for the page to become unlocked signifying the migration entry has been removed. This reference is dropped just prior to waiting on the page lock, however the extra reference can cause migration failures so it is desirable to avoid taking it. As migration code already has a reference to the migrating page an extra reference to wait on PG_locked is unnecessary so long as the reference can't be dropped whilst setting up the wait. When faulting on a migration entry the ptl is taken to check the migration entry. Removing a migration entry also requires the ptl, and migration code won't drop its page reference until after the migration entry has been removed. Therefore retaining the ptl of a migration entry is sufficient to ensure the page has a reference. Reworking migration_entry_wait() to hold the ptl until the wait setup is complete means the extra page reference is no longer needed. [apopple@nvidia.com: v5] Link: https://lkml.kernel.org/r/20211213033848.1973946-1-apopple@nvidia.com Link: https://lkml.kernel.org/r/20211118020754.954425-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 + mm/filemap.c | 91 +++++++++++++++++++++++++++++++++++++++++ mm/migrate.c | 38 ++--------------- 3 files changed, 97 insertions(+), 34 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4850cc5bf813..db96e10eb8da 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 2fd9b2f24025..60866ae711e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,6 +1390,95 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } +#ifdef CONFIG_MIGRATION +/** + * migration_entry_wait_on_locked - Wait for a migration entry to be removed + * @entry: migration swap entry. + * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required + * for pte entries, pass NULL for pmd entries. + * @ptl: already locked ptl. This function will drop the lock. + * + * Wait for a migration entry referencing the given page to be removed. This is + * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * this can be called without taking a reference on the page. Instead this + * should be called while holding the ptl for the migration entry referencing + * the page. + * + * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * + * This follows the same logic as folio_wait_bit_common() so see the comments + * there. + */ +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl) +{ + struct wait_page_queue wait_page; + wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + bool delayacct = false; + unsigned long pflags; + wait_queue_head_t *q; + struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + + q = folio_waitqueue(folio); + if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { + if (!folio_test_swapbacked(folio)) { + delayacct_thrashing_start(); + delayacct = true; + } + psi_memstall_enter(&pflags); + thrashing = true; + } + + init_wait(wait); + wait->func = wake_page_function; + wait_page.folio = folio; + wait_page.bit_nr = PG_locked; + wait->flags = 0; + + spin_lock_irq(&q->lock); + folio_set_waiters(folio); + if (!folio_trylock_flag(folio, PG_locked, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); + + /* + * If a migration entry exists for the page the migration path must hold + * a valid reference to the page, and it must take the ptl to remove the + * migration entry. So the page is valid until the ptl is dropped. + */ + if (ptep) + pte_unmap_unlock(ptep, ptl); + else + spin_unlock(ptl); + + for (;;) { + unsigned int flags; + + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) + break; + + io_schedule(); + continue; + } + break; + } + + finish_wait(q, wait); + + if (thrashing) { + if (delayacct) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } +} +#endif + void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); diff --git a/mm/migrate.c b/mm/migrate.c index 18ce840914f0..c7da064b4781 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -291,7 +291,6 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, { pte_t pte; swp_entry_t entry; - struct folio *folio; spin_lock(ptl); pte = *ptep; @@ -302,17 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - folio = page_folio(pfn_swap_entry_to_page(entry)); - - /* - * Once page cache replacement of page migration started, page_count - * is zero; but we must not call folio_put_wait_locked() without - * a ref. Use folio_try_get(), and just fault again if it fails. - */ - if (!folio_try_get(folio)) - goto out; - pte_unmap_unlock(ptep, ptl); - folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(entry, ptep, ptl); return; out: pte_unmap_unlock(ptep, ptl); @@ -337,16 +326,11 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct folio *folio; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); - if (!folio_try_get(folio)) - goto unlock; - spin_unlock(ptl); - folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); return; unlock: spin_unlock(ptl); @@ -2431,22 +2415,8 @@ static bool migrate_vma_check_page(struct page *page) return false; /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) { - /* - * Private page can never be pin as they have no valid pte and - * GUP will fail for those. Yet if there is a pending migration - * a thread might try to wait on the pte migration entry and - * will bump the page reference count. Sadly there is no way to - * differentiate a regular pin from migration wait. Hence to - * avoid 2 racing thread trying to migrate back to CPU to enter - * infinite loop (one stopping migration because the other is - * waiting on pte migration entry). We always return true here. - * - * FIXME proper solution is to rework migration_entry_wait() so - * it does not need to take a reference on page. - */ - return is_device_private_page(page); - } + if (is_zone_device_page(page)) + extra++; /* For file back page */ if (page_mapping(page)) From 3ddd9a808cee7284931312f2f3e854c9617f44b2 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:50 -0800 Subject: [PATCH 0334/1295] sysctl: add a new register_sysctl_init() interface Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 389e1e42e7d9..55a54973f061 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1383,6 +1384,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 1fa2b69c6fc3..d3ab7969b6b5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -199,6 +199,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; From 78e36f3b0dae586f623c4a37ec5eb5496f5abbe1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:55 -0800 Subject: [PATCH 0335/1295] sysctl: move some boundary constants from sysctl.c to sysctl_vals sysctl has helpers which let us specify boundary values for a min or max int value. Since these are used for a boundary check only they don't change, so move these variables to sysctl_vals to avoid adding duplicate variables. This will help with our cleanup of kernel/sysctl.c. [akpm@linux-foundation.org: update it for "mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30%"] [mcgrof@kernel.org: major rebase] Link: https://lkml.kernel.org/r/20211123202347.818157-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 13 +++++++++--- kernel/sysctl.c | 45 ++++++++++++++++++------------------------ 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55a54973f061..cd8bbf13d0f0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d3ab7969b6b5..47cf70c8eb93 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,16 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) -#define SYSCTL_ONE ((void *)&sysctl_vals[1]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) +#define SYSCTL_ONE ((void *)&sysctl_vals[2]) +#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_FOUR ((void *)&sysctl_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ef77be575d87..901a9dfe4d62 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -114,16 +114,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int two_hundred = 200; -static int one_thousand = 1000; -static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1964,7 +1957,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2306,7 +2299,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #endif { @@ -2566,7 +2559,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, }, #endif #ifdef CONFIG_RT_MUTEXES @@ -2628,7 +2621,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2646,7 +2639,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2677,7 +2670,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", @@ -2731,7 +2724,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2740,7 +2733,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2785,7 +2778,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", @@ -2802,7 +2795,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", @@ -2842,7 +2835,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two_hundred, + .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -2899,7 +2892,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = SYSCTL_FOUR, }, #ifdef CONFIG_COMPACTION { @@ -2916,7 +2909,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "extfrag_threshold", @@ -2961,7 +2954,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &three_thousand, + .extra2 = SYSCTL_THREE_THOUSAND, }, { .procname = "percpu_pagelist_high_fraction", @@ -3040,7 +3033,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "min_slab_ratio", @@ -3049,7 +3042,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, #endif #ifdef CONFIG_SMP @@ -3339,7 +3332,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", @@ -3348,7 +3341,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "suid_dumpable", @@ -3357,7 +3350,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { From bbe7a10ed83a5fa0b0ff6161ecdc4e65a0e9c993 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:00 -0800 Subject: [PATCH 0336/1295] hung_task: move hung_task sysctl interface to hung_task.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move hung_task sysctl interface to hung_task.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log refresh and fixed 2-3 0day reported compile issues] Link: https://lkml.kernel.org/r/20211123202347.818157-4-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/sysctl.h | 14 +------ kernel/hung_task.c | 81 ++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 61 --------------------------- 3 files changed, 79 insertions(+), 77 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 304f431178fd..c19dd5a2c05c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,20 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 901a9dfe4d62..a4e00abe6f78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -134,13 +134,6 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETECT_HUNG_TASK -static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -#endif #ifdef CONFIG_INOTIFY_USER #include @@ -2508,60 +2501,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_HUNG_TASK -#ifdef CONFIG_SMP - { - .procname = "hung_task_all_cpu_backtrace", - .data = &sysctl_hung_task_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_NEG_ONE, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", From dd0693fdf054f2ed37202ed56649ae2cccd29474 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:05 -0800 Subject: [PATCH 0337/1295] watchdog: move watchdog sysctl interface to watchdog.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic of proc sysctl. So, move the watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. [mcgrof@kernel.org: justify the move on the commit log] Link: https://lkml.kernel.org/r/20211123202347.818157-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 96 ------------------------------------------- kernel/watchdog.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a4e00abe6f78..55e25996e3cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,16 +103,10 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -2309,96 +2303,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ad912511a0c0..99afb88d2e85 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock(&watchdog_mutex); return err; } + +static const int sixty = 60; + +static struct ctl_table watchdog_sysctls[] = { + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif + {} +}; + +static void __init watchdog_sysctl_init(void) +{ + register_sysctl_init("kernel", watchdog_sysctls); +} +#else +#define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) @@ -753,4 +853,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); + watchdog_sysctl_init(); } From f628867da46f8867e1854e43d7200e42ec22eee2 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 21 Jan 2022 22:11:09 -0800 Subject: [PATCH 0338/1295] sysctl: make ngroups_max const ngroups_max is a read-only sysctl entry, reflecting NGROUPS_MAX. Make it const, in the same way as cap_last_cap. Link: https://lkml.kernel.org/r/20211123202347.818157-6-mcgrof@kernel.org Signed-off-by: Stephen Kitt Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 55e25996e3cf..29babf1f0d4a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,7 +125,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2291,7 +2291,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, From d73840ec2f747b860331bbba53677d0ce38fb9c1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:14 -0800 Subject: [PATCH 0339/1295] sysctl: use const for typically used max/min proc sysctls When proc_dointvec_minmax() or proc_doulongvec_minmax() are used we are using the extra1 and extra2 parameters on the sysctl table only for a min and max boundary, these extra1 and extra2 arguments are then used for read-only operations. So make them const to reflect this. [mcgrof@kernel.org: commit log love] Link: https://lkml.kernel.org/r/20211123202347.818157-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 53 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29babf1f0d4a..134c2cb9ed4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,27 +108,26 @@ /* Constants used for minimum and maximum */ -static unsigned long zero_ul; -static unsigned long one_ul = 1; -static unsigned long long_max = LONG_MAX; +static const unsigned long zero_ul; +static const unsigned long one_ul = 1; +static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK -static int ten_thousand = 10000; +static const int ten_thousand = 10000; #endif #ifdef CONFIG_PERF_EVENTS -static int six_hundred_forty_kb = 640 * 1024; +static const int six_hundred_forty_kb = 640 * 1024; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; +static const int maxolduid = 65535; +static const int minolduid; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; - #ifdef CONFIG_INOTIFY_USER #include #endif @@ -172,8 +171,8 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; +static const int min_extfrag_threshold; +static const int max_extfrag_threshold = 1000; #endif #endif /* CONFIG_SYSCTL */ @@ -2177,8 +2176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -2186,8 +2185,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 { @@ -2261,7 +2260,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, + .extra2 = (void *)&ten_thousand, }, { .procname = "printk_devkmsg", @@ -2473,7 +2472,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, + .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", @@ -2629,7 +2628,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, + .extra1 = (void *)&one_ul, }, { .procname = "dirty_ratio", @@ -2646,7 +2645,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, + .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -2760,8 +2759,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, + .extra1 = (void *)&min_extfrag_threshold, + .extra2 = (void *)&max_extfrag_threshold, }, { .procname = "compact_unevictable_allowed", @@ -3047,8 +3046,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, + .extra1 = (void *)&zero_ul, + .extra2 = (void *)&long_max, }, { .procname = "nr_open", @@ -3072,8 +3071,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -3081,8 +3080,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING { From 2452dcb9f7f2bab5b47344148bc23a732be9195c Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:19 -0800 Subject: [PATCH 0340/1295] sysctl: use SYSCTL_ZERO to replace some static int zero uses Use the variable SYSCTL_ZERO to replace some static int boundary variables with a value of 0 (minolduid, min_extfrag_threshold, min_wakeup_granularity_ns). Link: https://lkml.kernel.org/r/20211123202347.818157-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 134c2cb9ed4e..a2175a305f10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static const int maxolduid = 65535; -static const int minolduid; +/* minolduid is SYSCTL_ZERO */ static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -171,7 +171,7 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static const int min_extfrag_threshold; +/* min_extfrag_threshold is SYSCTL_ZERO */; static const int max_extfrag_threshold = 1000; #endif @@ -2176,7 +2176,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -2185,7 +2185,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 @@ -2759,7 +2759,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&min_extfrag_threshold, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&max_extfrag_threshold, }, { @@ -3071,7 +3071,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -3080,7 +3080,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING From 86b12b6c5d6b46e64bf2e8080528781032e4bd90 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:24 -0800 Subject: [PATCH 0341/1295] aio: move aio sysctl to aio.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move aio sysctl to aio.c and use the new register_sysctl_init() to register the sysctl interface for aio. [mcgrof@kernel.org: adjust commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 31 +++++++++++++++++++++++++++++-- include/linux/aio.h | 4 ---- kernel/sysctl.c | 17 ----------------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f6f1cbffef9e..4ceba13a7db0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -220,9 +220,35 @@ struct aio_kiocb { /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); -unsigned long aio_nr; /* current system wide number of aio requests */ -unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +static unsigned long aio_nr; /* current system wide number of aio requests */ +static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ +#ifdef CONFIG_SYSCTL +static struct ctl_table aio_sysctls[] = { + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + {} +}; + +static void __init aio_sysctl_init(void) +{ + register_sysctl_init("fs", aio_sysctls); +} +#else +#define aio_sysctl_init() do { } while (0) +#endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; @@ -275,6 +301,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + aio_sysctl_init(); return 0; } __initcall(aio_setup); diff --git a/include/linux/aio.h b/include/linux/aio.h index b83e68dd006f..86892a4fe7c8 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -20,8 +20,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -/* for sysctl: */ -extern unsigned long aio_nr; -extern unsigned long aio_max_nr; - #endif /* __LINUX__AIO_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a2175a305f10..822555a0ec76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -3111,22 +3110,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .procname = "inotify", From 49a4de75719b6c0f1f375df9908a95cef1e34945 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:29 -0800 Subject: [PATCH 0342/1295] dnotify: move dnotify sysctl to dnotify.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move dnotify sysctls to dnotify.c and use the new register_sysctl_init() to register the sysctl interface. [mcgrof@kernel.org: adjust the commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-10-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Acked-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 21 ++++++++++++++++++++- include/linux/dnotify.h | 1 - kernel/sysctl.c | 10 ---------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5ebebb034ff..829dd4a61b66 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -19,7 +19,25 @@ #include #include -int dir_notify_enable __read_mostly = 1; +static int dir_notify_enable __read_mostly = 1; +#ifdef CONFIG_SYSCTL +static struct ctl_table dnotify_sysctls[] = { + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +static void __init dnotify_sysctl_init(void) +{ + register_sysctl_init("fs", dnotify_sysctls); +} +#else +#define dnotify_sysctl_init() do { } while (0) +#endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; @@ -386,6 +404,7 @@ static int __init dnotify_init(void) dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); + dnotify_sysctl_init(); return 0; } diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index b87c3b85a166..b1d26f9f1c9f 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -29,7 +29,6 @@ struct dnotify_struct { FS_CREATE | FS_RENAME |\ FS_MOVED_FROM | FS_MOVED_TO) -extern int dir_notify_enable; extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 822555a0ec76..fb37825cd3cd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -3091,15 +3090,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { From c8dd55410ba07de602169e037cbb62e495ad1880 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:34 -0800 Subject: [PATCH 0343/1295] hpet: simplify subdirectory registration with register_sysctl() Patch series "sysctl: second set of kernel/sysctl cleanups", v2. This is the 2nd set of kernel/sysctl.c cleanups. The diff stat should reflect how this is a much better way to deal with theses. Fortunately coccinelle can be used to ensure correctness for most of these and/or future merge conflicts. Note that since this is part of a larger effort to cleanup kernel/sysctl.c I think we have no other option but to go with merging these patches in either Andrew's tree or keep them staged in a separate tree and send a merge request later. Otherwise kernel/sysctl.c will end up becoming a sore spot for the next merge window. This patch (of 8): There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci drivers/char/hpet.c @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202422.819032-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: "Eric W. Biederman" Cc: Clemens Ladisch Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: David Airlie Cc: Benjamin Herrenschmidt Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Jan Kara Cc: Amir Goldstein Cc: Phillip Potter Cc: Al Viro Cc: Julia Lawall Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 4e5431f01450..563dfae3b8da 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -746,26 +746,6 @@ static struct ctl_table hpet_table[] = { {} }; -static struct ctl_table hpet_root[] = { - { - .procname = "hpet", - .maxlen = 0, - .mode = 0555, - .child = hpet_table, - }, - {} -}; - -static struct ctl_table dev_root[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = hpet_root, - }, - {} -}; - static struct ctl_table_header *sysctl_header; /* @@ -1061,7 +1041,7 @@ static int __init hpet_init(void) if (result < 0) return -ENODEV; - sysctl_header = register_sysctl_table(dev_root); + sysctl_header = register_sysctl("dev/hpet", hpet_table); result = acpi_bus_register_driver(&hpet_acpi_driver); if (result < 0) { From e5a1fd997cc2deda1b08d5faae04625de0440a1e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:39 -0800 Subject: [PATCH 0344/1295] i915: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Acked-by: Jani Nikula Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_perf.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 170bba913c30..e27f3b7cf094 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -4273,26 +4273,6 @@ static struct ctl_table oa_table[] = { {} }; -static struct ctl_table i915_root[] = { - { - .procname = "i915", - .maxlen = 0, - .mode = 0555, - .child = oa_table, - }, - {} -}; - -static struct ctl_table dev_root[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = i915_root, - }, - {} -}; - static void oa_init_supported_formats(struct i915_perf *perf) { struct drm_i915_private *i915 = perf->i915; @@ -4488,7 +4468,7 @@ static int destroy_config(int id, void *p, void *data) int i915_perf_sysctl_register(void) { - sysctl_header = register_sysctl_table(dev_root); + sysctl_header = register_sysctl("dev/i915", oa_table); return 0; } From e99f5e7479110868ac04c5f6593a15c2da61f969 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:44 -0800 Subject: [PATCH 0345/1295] macintosh/mac_hid.c: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/macintosh/mac_hid.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 28b8581b44dd..d8c4d5664145 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -239,33 +239,11 @@ static struct ctl_table mac_hid_files[] = { { } }; -/* dir in /proc/sys/dev */ -static struct ctl_table mac_hid_dir[] = { - { - .procname = "mac_hid", - .maxlen = 0, - .mode = 0555, - .child = mac_hid_files, - }, - { } -}; - -/* /proc/sys/dev itself, in case that is not there yet */ -static struct ctl_table mac_hid_root_dir[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = mac_hid_dir, - }, - { } -}; - static struct ctl_table_header *mac_hid_sysctl_header; static int __init mac_hid_init(void) { - mac_hid_sysctl_header = register_sysctl_table(mac_hid_root_dir); + mac_hid_sysctl_header = register_sysctl("dev/mac_hid", mac_hid_files); if (!mac_hid_sysctl_header) return -ENOMEM; From c42ff46f97c1c25577a84fbfb111710d25a129e0 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:49 -0800 Subject: [PATCH 0346/1295] ocfs2: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 16f1bfc407f2..731558a6f27d 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -672,31 +672,8 @@ static struct ctl_table ocfs2_mod_table[] = { { } }; -static struct ctl_table ocfs2_kern_table[] = { - { - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { } -}; - -static struct ctl_table ocfs2_root_table[] = { - { - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { } -}; - static struct ctl_table_header *ocfs2_table_header; - /* * Initialization */ @@ -705,7 +682,7 @@ static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + ocfs2_table_header = register_sysctl("fs/ocfs2", ocfs2_mod_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); From 04bc883c986d9c8a64fc1f1cc2cbc328c2b2a496 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:54 -0800 Subject: [PATCH 0347/1295] test_sysctl: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci lib/test_sysctl.c @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-6-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 3750323973f4..a5a3d6c27e1f 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -128,26 +128,6 @@ static struct ctl_table test_table[] = { { } }; -static struct ctl_table test_sysctl_table[] = { - { - .procname = "test_sysctl", - .maxlen = 0, - .mode = 0555, - .child = test_table, - }, - { } -}; - -static struct ctl_table test_sysctl_root_table[] = { - { - .procname = "debug", - .maxlen = 0, - .mode = 0555, - .child = test_sysctl_table, - }, - { } -}; - static struct ctl_table_header *test_sysctl_header; static int __init test_sysctl_init(void) @@ -155,7 +135,7 @@ static int __init test_sysctl_init(void) test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL); if (!test_data.bitmap_0001) return -ENOMEM; - test_sysctl_header = register_sysctl_table(test_sysctl_root_table); + test_sysctl_header = register_sysctl("debug/test_sysctl", test_table); if (!test_sysctl_header) { kfree(test_data.bitmap_0001); return -ENOMEM; From 7b9ad122b52c9839e1f68f16c907990a6ad6f793 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:59 -0800 Subject: [PATCH 0348/1295] inotify: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. Move inotify_user sysctl to inotify_user.c while at it to remove clutter from kernel/sysctl.c. [mcgrof@kernel.org: remember to register fanotify_table] Link: https://lkml.kernel.org/r/YZ5A6iWLb0h3N3RC@bombadil.infradead.org [mcgrof@kernel.org: update commit log to reflect new path we decided to take] Link: https://lkml.kernel.org/r/20211123202422.819032-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 10 +++++++++- fs/notify/inotify/inotify_user.c | 11 ++++++++++- include/linux/fanotify.h | 2 -- include/linux/inotify.h | 3 --- kernel/sysctl.c | 21 --------------------- 5 files changed, 19 insertions(+), 28 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 73a3e939c921..73b1615f9d96 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -59,7 +59,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -struct ctl_table fanotify_table[] = { +static struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], @@ -88,6 +88,13 @@ struct ctl_table fanotify_table[] = { }, { } }; + +static void __init fanotify_sysctls_init(void) +{ + register_sysctl("fs/fanotify", fanotify_table); +} +#else +#define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ /* @@ -1743,6 +1750,7 @@ static int __init fanotify_user_setup(void) init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + fanotify_sysctls_init(); return 0; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 29fca3284bb5..54583f62dc44 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static long it_zero = 0; static long it_int_max = INT_MAX; -struct ctl_table inotify_table[] = { +static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], @@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = { }, { } }; + +static void __init inotify_sysctls_init(void) +{ + register_sysctl("fs/inotify", inotify_table); +} + +#else +#define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) @@ -849,6 +857,7 @@ static int __init inotify_user_setup(void) inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + inotify_sysctls_init(); return 0; } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3afdf339d53c..419cadcd7ff5 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -5,8 +5,6 @@ #include #include -extern struct ctl_table fanotify_table[]; /* for sysctl */ - #define FAN_GROUP_FLAG(group, flag) \ ((group)->fanotify_data.flags & (flag)) diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 6a24905f6e1e..8d20caa1b268 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,11 +7,8 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H -#include #include -extern struct ctl_table inotify_table[]; /* for sysctl */ - #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \ IN_MOVED_TO | IN_CREATE | IN_DELETE | \ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb37825cd3cd..2ee0e76888d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,13 +126,6 @@ static const int maxolduid = 65535; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -#ifdef CONFIG_INOTIFY_USER -#include -#endif -#ifdef CONFIG_FANOTIFY -#include -#endif - #ifdef CONFIG_PROC_SYSCTL /** @@ -3100,20 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_FANOTIFY - { - .procname = "fanotify", - .mode = 0555, - .child = fanotify_table, - }, -#endif #ifdef CONFIG_EPOLL { .procname = "epoll", From ad8f74315b335c55f3d8bd7c37d21d6c74d1be20 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:04 -0800 Subject: [PATCH 0349/1295] cdrom: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cdrom/cdrom.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 9877e413fce3..1b57d4666e43 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3691,27 +3691,6 @@ static struct ctl_table cdrom_table[] = { }, { } }; - -static struct ctl_table cdrom_cdrom_table[] = { - { - .procname = "cdrom", - .maxlen = 0, - .mode = 0555, - .child = cdrom_table, - }, - { } -}; - -/* Make sure that /proc/sys/dev is there */ -static struct ctl_table cdrom_root_table[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = cdrom_cdrom_table, - }, - { } -}; static struct ctl_table_header *cdrom_sysctl_header; static void cdrom_sysctl_register(void) @@ -3721,7 +3700,7 @@ static void cdrom_sysctl_register(void) if (!atomic_add_unless(&initialized, 1, 1)) return; - cdrom_sysctl_header = register_sysctl_table(cdrom_root_table); + cdrom_sysctl_header = register_sysctl("dev/cdrom", cdrom_table); /* set the defaults */ cdrom_sysctl_settings.autoclose = autoclose; From a8f5de894f76f1c73f4a068d04897a5e2f873825 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:09 -0800 Subject: [PATCH 0350/1295] eventpoll: simplify sysctl declaration with register_sysctl() The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the epoll_table sysctl to fs/eventpoll.c and use register_sysctl(). Link: https://lkml.kernel.org/r/20211123202422.819032-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 10 +++++++++- include/linux/poll.h | 2 -- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 06f4c5ae1451..e2daa940ebce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -struct ctl_table epoll_table[] = { +static struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = { }, { } }; + +static void __init epoll_sysctls_init(void) +{ + register_sysctl("fs/epoll", epoll_table); +} +#else +#define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; @@ -2378,6 +2385,7 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); diff --git a/include/linux/poll.h b/include/linux/poll.h index 1cdc32b1f1b0..a9e0e1c2d1f2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -8,12 +8,10 @@ #include #include #include -#include #include #include #include -extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #ifdef __clang__ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 47cf70c8eb93..6dd0f277f844 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -219,7 +219,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; extern struct ctl_table firmware_config_table[]; -extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ee0e76888d8..09fa72299f18 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3093,13 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif #endif { .procname = "protected_symlinks", From 6aad36d421d8bfe156508fa4edfe67827234cf0f Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:13 -0800 Subject: [PATCH 0351/1295] firmware_loader: move firmware sysctl to its own files Patch series "sysctl: 3rd set of kernel/sysctl cleanups", v2. This is the third set of patches to help address cleaning the kitchen seink in kernel/sysctl.c and to move sysctls away to where they are actually implemented / used. This patch (of 8): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the firmware configuration sysctl table to the only place where it is used, and make it clear that if sysctls are disabled this is not used. [akpm@linux-foundation.org: export register_firmware_config_sysctl and unregister_firmware_config_sysctl to modules] [akpm@linux-foundation.org: use EXPORT_SYMBOL_NS_GPL instead] [sfr@canb.auug.org.au: fix that so it compiles] Link: https://lkml.kernel.org/r/20211201160626.401d828d@canb.auug.org.au [mcgrof@kernel.org: major commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211124231435.1445213-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Eric Biederman Cc: Stephen Kitt Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Theodore Ts'o" Cc: Al Viro Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: John Ogness Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Suren Baghdasaryan Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/firmware_loader/fallback.c | 7 +++++- drivers/base/firmware_loader/fallback.h | 11 ++++++++ drivers/base/firmware_loader/fallback_table.c | 25 +++++++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------ 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index d7d63c1aa993..4afb0e9312c0 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -199,11 +199,16 @@ static struct class firmware_class = { int register_sysfs_loader(void) { - return class_register(&firmware_class); + int ret = class_register(&firmware_class); + + if (ret != 0) + return ret; + return register_firmware_config_sysctl(); } void unregister_sysfs_loader(void) { + unregister_firmware_config_sysctl(); class_unregister(&firmware_class); } diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h index 3af7205b302f..9f3055d3b4ca 100644 --- a/drivers/base/firmware_loader/fallback.h +++ b/drivers/base/firmware_loader/fallback.h @@ -42,6 +42,17 @@ void fw_fallback_set_default_timeout(void); int register_sysfs_loader(void); void unregister_sysfs_loader(void); +#ifdef CONFIG_SYSCTL +extern int register_firmware_config_sysctl(void); +extern void unregister_firmware_config_sysctl(void); +#else +static inline int register_firmware_config_sysctl(void) +{ + return 0; +} +static inline void unregister_firmware_config_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + #else /* CONFIG_FW_LOADER_USER_HELPER */ static inline int firmware_fallback_sysfs(struct firmware *fw, const char *name, struct device *device, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 46a731dede6f..e5ac098d0742 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, FIRMWARE_LOADER_PRIVATE); #ifdef CONFIG_SYSCTL -struct ctl_table firmware_config_table[] = { +static struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = &fw_fallback_config.force_sysfs_fallback, @@ -45,4 +46,24 @@ struct ctl_table firmware_config_table[] = { }, { } }; -#endif + +static struct ctl_table_header *firmware_config_sysct_table_header; +int register_firmware_config_sysctl(void) +{ + firmware_config_sysct_table_header = + register_sysctl("kernel/firmware_config", + firmware_config_table); + if (!firmware_config_sysct_table_header) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_NS_GPL(register_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +void unregister_firmware_config_sysctl(void) +{ + unregister_sysctl_table(firmware_config_sysct_table_header); + firmware_config_sysct_table_header = NULL; +} +EXPORT_SYMBOL_NS_GPL(unregister_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +#endif /* CONFIG_SYSCTL */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6dd0f277f844..3985e9c80155 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -218,7 +218,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; -extern struct ctl_table firmware_config_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09fa72299f18..5fc3ae16e995 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2154,13 +2154,6 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif { .procname = "overflowuid", .data = &overflowuid, From 5475e8f03c80bbce7b43a57d861f5acc44a60b22 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:18 -0800 Subject: [PATCH 0352/1295] random: move the random sysctl declarations to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the random sysctls to their own file and use register_sysctl_init(). [mcgrof@kernel.org: commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 14 ++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 5 ----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index b411182df6f6..68613f0b6887 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1992,8 +1992,7 @@ static int proc_do_entropy(struct ctl_table *table, int write, void *buffer, } static int sysctl_poolsize = POOL_BITS; -extern struct ctl_table random_table[]; -struct ctl_table random_table[] = { +static struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, @@ -2055,6 +2054,17 @@ struct ctl_table random_table[] = { #endif { } }; + +/* + * rand_initialize() is called before sysctl_init(), + * so we cannot call register_sysctl_init() in rand_initialize() + */ +static int __init random_sysctls_init(void) +{ + register_sysctl_init("kernel/random", random_table); + return 0; +} +device_initcall(random_sysctls_init); #endif /* CONFIG_SYSCTL */ struct batched_entropy { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3985e9c80155..fce05a060bc5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -217,7 +217,6 @@ extern int unaligned_dump_stack; extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; -extern struct ctl_table random_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5fc3ae16e995..67a5b04f2f84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2144,11 +2144,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_max_threads, }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, { .procname = "usermodehelper", .mode = 0555, From ee9efac48a082904d17a20131aa73d82f058cdd6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:23 -0800 Subject: [PATCH 0353/1295] sysctl: add helper to register a sysctl mount point The way to create a subdirectory on top of sysctl_mount_point is a bit obscure, and *why* we do that even so more. Provide a helper which makes it clear why we do this. [akpm@linux-foundation.org: export register_sysctl_mount_point() to modules] Link: https://lkml.kernel.org/r/20211124231435.1445213-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Suggested-by: "Eric W. Biederman" Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 14 ++++++++++++++ include/linux/sysctl.h | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index cd8bbf13d0f0..6bb2d9a3513d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -35,6 +35,20 @@ struct ctl_table sysctl_mount_point[] = { { } }; +/** + * register_sysctl_mount_point() - registers a sysctl mount point + * @path: path for the mount point + * + * Used to create a permanently empty directory to serve as mount point. + * There are some subtle but important permission checks this allows in the + * case of unprivileged mounts. + */ +struct ctl_table_header *register_sysctl_mount_point(const char *path) +{ + return register_sysctl(path, sysctl_mount_point); +} +EXPORT_SYMBOL(register_sysctl_mount_point); + static bool is_empty_dir(struct ctl_table_header *head) { return head->ctl_table[0].child == sysctl_mount_point; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fce05a060bc5..746c098a6ff5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -209,6 +209,8 @@ extern int sysctl_init(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) +extern struct ctl_table_header *register_sysctl_mount_point(const char *path); + void do_sysctl_args(void); extern int pwrsw_enabled; @@ -224,6 +226,11 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } +static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +{ + return NULL; +} + static inline struct ctl_table_header *register_sysctl_paths( const struct ctl_path *path, struct ctl_table *table) { From 3ba442d5331ff4d4339faf4986d0841386196f92 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:28 -0800 Subject: [PATCH 0354/1295] fs: move binfmt_misc sysctl to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. This moves the binfmt_misc sysctl to its own file to help remove clutter from kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211124231435.1445213-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 6 +++++- kernel/sysctl.c | 7 ------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..ddea6acbddde 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -822,7 +822,11 @@ static int __init init_misc_binfmt(void) int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - return err; + if (!register_sysctl_mount_point("fs/binfmt_misc")) { + pr_warn("Failed to create fs/binfmt_misc sysctl mount point"); + return -ENOMEM; + } + return 0; } static void __exit exit_misc_binfmt(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 67a5b04f2f84..fb3cc7a1d4ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,13 +3127,6 @@ static struct ctl_table fs_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif { .procname = "pipe-max-size", .data = &pipe_max_size, From faaa357a55e03490fb280ac211be2298e635b220 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:33 -0800 Subject: [PATCH 0355/1295] printk: move printk sysctl to printk/sysctl.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move printk sysctl from kernel/sysctl.c to kernel/printk/sysctl.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fixed compile issues when PRINTK is not set, commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 5 ++- kernel/printk/internal.h | 6 +++ kernel/printk/printk.c | 1 + kernel/printk/sysctl.c | 85 ++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 68 -------------------------------- 5 files changed, 96 insertions(+), 69 deletions(-) create mode 100644 kernel/printk/sysctl.c diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..f5b388e810b9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,5 +2,8 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o + +obj-$(CONFIG_PRINTK) += printk_support.o +printk_support-y := printk_ringbuffer.o +printk_support-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..6b1c4b399845 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -4,6 +4,12 @@ */ #include +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +void __init printk_sysctl_init(void); +#else +#define printk_sysctl_init() do { } while (0) +#endif + #ifdef CONFIG_PRINTK /* Flags for a single printk record. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..363a493bded8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3211,6 +3211,7 @@ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + printk_sysctl_init(); return 0; } late_initcall(printk_late_init); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c new file mode 100644 index 000000000000..653ae04aab7f --- /dev/null +++ b/kernel/printk/sysctl.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sysctl.c: General linux system control interface + */ + +#include +#include +#include +#include +#include "internal.h" + +static const int ten_thousand = 10000; + +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +static struct ctl_table printk_sysctls[] = { + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + {} +}; + +void __init printk_sysctl_init(void) +{ + register_sysctl_init("kernel", printk_sysctls); +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb3cc7a1d4ad..509f782483db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -908,17 +908,6 @@ static int proc_taint(struct ctl_table *table, int write, return err; } -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -2210,63 +2199,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, -#endif { .procname = "ngroups_max", .data = (void *)&ngroups_max, From 26d1c80fd61e59d5e2eb6fda00e0148a6704ddeb Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:38 -0800 Subject: [PATCH 0356/1295] scsi/sg: move sg-big-buff sysctl to scsi/sg.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the sg-big-buff sysctl from kernel/sysctl.c to drivers/scsi/sg.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/sg.c | 35 ++++++++++++++++++++++++++++++++++- include/scsi/sg.h | 4 ---- kernel/sysctl.c | 12 ------------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ad12b3261845..6b43e97bd417 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -77,7 +77,7 @@ static int sg_proc_init(void); #define SG_DEFAULT_TIMEOUT mult_frac(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ) -int sg_big_buff = SG_DEF_RESERVED_SIZE; +static int sg_big_buff = SG_DEF_RESERVED_SIZE; /* N.B. This variable is readable and writeable via /proc/scsi/sg/def_reserved_size . Each time sg_open() is called a buffer of this size (or less if there is not enough memory) will be reserved @@ -1634,6 +1634,37 @@ MODULE_PARM_DESC(scatter_elem_sz, "scatter gather element " MODULE_PARM_DESC(def_reserved_size, "size of buffer reserved for each fd"); MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); +#ifdef CONFIG_SYSCTL +#include + +static struct ctl_table sg_sysctls[] = { + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_table_header *hdr; +static void register_sg_sysctls(void) +{ + if (!hdr) + hdr = register_sysctl("kernel", sg_sysctls); +} + +static void unregister_sg_sysctls(void) +{ + if (hdr) + unregister_sysctl_table(hdr); +} +#else +#define register_sg_sysctls() do { } while (0) +#define unregister_sg_sysctls() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static int __init init_sg(void) { @@ -1666,6 +1697,7 @@ init_sg(void) return 0; } class_destroy(sg_sysfs_class); + register_sg_sysctls(); err_out: unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS); return rc; @@ -1674,6 +1706,7 @@ err_out: static void __exit exit_sg(void) { + unregister_sg_sysctls(); #ifdef CONFIG_SCSI_PROC_FS remove_proc_subtree("scsi/sg", NULL); #endif /* CONFIG_SCSI_PROC_FS */ diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 843cefb8efce..068e35d36557 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -29,10 +29,6 @@ * For utility and test programs see: http://sg.danny.cz/sg/sg3_utils.html */ -#ifdef __KERNEL__ -extern int sg_big_buff; /* for sysctl */ -#endif - typedef struct sg_iovec /* same structure as used by readv() Linux system */ { /* call. It defines one scatter-gather element. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 509f782483db..b0cced3808d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_CHR_DEV_SG -#include -#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif @@ -2090,15 +2087,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_BSD_PROCESS_ACCT { .procname = "acct", From 0df8bdd5e3b3e557ce2c2575fce0c64c5dd1045a Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:43 -0800 Subject: [PATCH 0357/1295] stackleak: move stack_erasing sysctl to stackleak.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the stack_erasing sysctl from kernel/sysctl.c to kernel/stackleak.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackleak.h | 5 ----- kernel/stackleak.c | 26 ++++++++++++++++++++++++-- kernel/sysctl.c | 14 -------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index a59db2f08e76..ccaab2043fcd 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -23,11 +23,6 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ce161a8e8d97..66b8af394e58 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -16,11 +16,13 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #include +#include static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -42,6 +44,26 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, state ? "enabled" : "disabled"); return ret; } +static struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0cced3808d4..3e06dafdd2c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include -#endif #if defined(CONFIG_SYSCTL) @@ -2442,17 +2439,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; From b1f2aff888af54a057c2c3c0d88a13ef5d37b52a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:48 -0800 Subject: [PATCH 0358/1295] sysctl: share unsigned long const values Provide a way to share unsigned long values. This will allow others to not have to re-invent these values. Link: https://lkml.kernel.org/r/20211124231435.1445213-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +++ include/linux/sysctl.h | 6 ++++++ kernel/sysctl.c | 9 +++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6bb2d9a3513d..2eb5987091ea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -29,6 +29,9 @@ static const struct inode_operations proc_sys_dir_operations; const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 746c098a6ff5..2de6d20d191b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,12 @@ struct ctl_dir; extern const int sysctl_vals[]; +#define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) +#define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) +#define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) + +extern const unsigned long sysctl_long_vals[]; + typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e06dafdd2c3..1b8a1cb8aac9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,9 +100,6 @@ /* Constants used for minimum and maximum */ -static const unsigned long zero_ul; -static const unsigned long one_ul = 1; -static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK static const int ten_thousand = 10000; #endif @@ -2513,7 +2510,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = (void *)&one_ul, + .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", @@ -2931,8 +2928,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = (void *)&zero_ul, - .extra2 = (void *)&long_max, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", From 1d67fe585049d3e2448b997af78c68cbf90ada09 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:52 -0800 Subject: [PATCH 0359/1295] fs: move inode sysctls to its own file Patch series "sysctl: 4th set of kernel/sysctl cleanups". This is slimming down the fs uses of kernel/sysctl.c to the point that the next step is to just get rid of the fs base directory for it and move that elsehwere, so that next patch series starts dealing with that to demo how we can end up cleaning up a full base directory from kernel/sysctl.c, one at a time. This patch (of 9): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the inode sysctls to its own file. Since we are no longer using this outside of fs/ remove the extern declaration of its respective proc helper. We use early_initcall() as it is the earliest we can use. [arnd@arndb.de: avoid unused-variable warning] Link: https://lkml.kernel.org/r/20211203190123.874239-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 39 ++++++++++++++++++++++++++++++++------- include/linux/fs.h | 3 --- kernel/sysctl.c | 14 -------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 980e7b7a5460..63324df6fa27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -67,11 +67,6 @@ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); -/* - * Statistics gathering.. - */ -struct inodes_stat_t inodes_stat; - static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); @@ -106,13 +101,43 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +/* + * Statistics gathering.. + */ +static struct inodes_stat_t inodes_stat; + +static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table inodes_sysctls[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { } +}; + +static int __init init_fs_inode_sysctls(void) +{ + register_sysctl_init("fs", inodes_sysctls); + return 0; +} +early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..044de67c8167 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,7 +82,6 @@ extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; @@ -3537,8 +3536,6 @@ int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b8a1cb8aac9..402c9d3246bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,20 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, { .procname = "file-nr", .data = &files_stat, From 204d5a24e15562b2816825c0f9b49d26814b77be Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:56 -0800 Subject: [PATCH 0360/1295] fs: move fs stat sysctls to file_table.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. We can create the sysctl dynamically on early init for fs stat to help with this clutter. This dusts off the fs stat syctls knobs and puts them into where they are declared. Link: https://lkml.kernel.org/r/20211129205548.605569-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 47 ++++++++++++++++++++++++++++++++++++++-------- include/linux/fs.h | 3 --- kernel/sysctl.c | 25 ------------------------ 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 45437f8e1003..57edef16dce4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -33,7 +33,7 @@ #include "internal.h" /* sysctl tunables... */ -struct files_stat_struct files_stat = { +static struct files_stat_struct files_stat = { .max_files = NR_FILE }; @@ -75,22 +75,53 @@ unsigned long get_max_files(void) } EXPORT_SYMBOL_GPL(get_max_files); +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + /* * Handle nr_files sysctl */ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nr_files(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -#else -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + +static struct ctl_table fs_stat_sysctls[] = { + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { } +}; + +static int __init init_fs_stat_sysctls(void) { - return -ENOSYS; + register_sysctl_init("fs", fs_stat_sysctls); + return 0; } +fs_initcall(init_fs_stat_sysctls); #endif static struct file *__alloc_file(int flags, const struct cred *cred) diff --git a/include/linux/fs.h b/include/linux/fs.h index 044de67c8167..1e6761966120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -79,7 +79,6 @@ extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); -extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; @@ -3532,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 402c9d3246bf..d2c411b15854 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,31 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = SYSCTL_LONG_ZERO, - .extra2 = SYSCTL_LONG_MAX, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, { .procname = "dentry-state", .data = &dentry_stat, From c8c0c239d5ab1e3e8d2bb0453ce642fe2c6357ec Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:59 -0800 Subject: [PATCH 0361/1295] fs: move dcache sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the dcache sysctl clutter out of kernel/sysctl.c. This is a small one-off entry, perhaps later we can simplify this representation, but for now we use the helpers we have. We won't know how we can simplify this further untl we're fully done with the cleanup. [arnd@arndb.de: avoid unused-function warning] Link: https://lkml.kernel.org/r/20211203190123.874239-2-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 37 +++++++++++++++++++++++++++++++------ include/linux/dcache.h | 10 ---------- include/linux/fs.h | 2 -- kernel/sysctl.c | 7 ------- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index cf871a81f4fd..c84269c6e8bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -115,10 +115,13 @@ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } - -/* Statistics gathering. */ -struct dentry_stat_t dentry_stat = { - .age_limit = 45, +struct dentry_stat_t { + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); @@ -126,6 +129,10 @@ static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +/* Statistics gathering. */ +static struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; /* * Here we resort to our own counters instead of using generic per-cpu counters @@ -167,14 +174,32 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table fs_dcache_sysctls[] = { + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { } +}; + +static int __init init_fs_dcache_sysctls(void) +{ + register_sysctl_init("fs", fs_dcache_sysctls); + return 0; +} +fs_initcall(init_fs_dcache_sysctls); #endif /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..f5bba51480b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -61,16 +61,6 @@ extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; -struct dentry_stat_t { - long nr_dentry; - long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long nr_negative; /* # of unused negative dentries */ - long dummy; /* Reserved for future use */ -}; -extern struct dentry_stat_t dentry_stat; - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/fs.h b/include/linux/fs.h index 1e6761966120..9b856d5da9e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_dentry(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d2c411b15854..4d182fb5c22e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,13 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, { .procname = "overflowuid", .data = &fs_overflowuid, From 54771613e8a7dbbba2a205ddf1b33e25a290b3fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:03 -0800 Subject: [PATCH 0362/1295] sysctl: move maxolduid as a sysctl specific const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maxolduid value is only shared for sysctl purposes for use on a max range. Just stuff this into our shared const array. [akpm@linux-foundation.org: fix sysctl_vals[], per Mickaël] Link: https://lkml.kernel.org/r/20211129205548.605569-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Mickaël Salaün Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 12 ++++-------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 2eb5987091ea..b9d53b53d769 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 2de6d20d191b..bb921eb8a02d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -49,6 +49,9 @@ struct ctl_dir; #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +#define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) + extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d182fb5c22e..5bbb4c59dd1a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,10 +110,6 @@ static const int six_hundred_forty_kb = 640 * 1024; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static const int maxolduid = 65535; -/* minolduid is SYSCTL_ZERO */ - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2127,7 +2123,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2136,7 +2132,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_S390 { @@ -2908,7 +2904,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2917,7 +2913,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_FILE_LOCKING { From d1d8ac9edf10e83f16d13f009439585a1f93ccb2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:06 -0800 Subject: [PATCH 0363/1295] fs: move shared sysctls to fs/sysctls.c To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move sysctls which are shared between filesystems into a common file outside of kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211129205548.605569-6-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 1 + fs/sysctls.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 18 ------------------ 3 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 fs/sysctls.c diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..ea8770d124da 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/sysctls.c b/fs/sysctls.c new file mode 100644 index 000000000000..54216cd1ecd7 --- /dev/null +++ b/fs/sysctls.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/sys/fs shared sysctls + * + * These sysctls are shared between different filesystems. + */ +#include +#include + +static struct ctl_table fs_shared_sysctls[] = { + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { } +}; + +static int __init init_fs_shared_sysctls(void) +{ + register_sysctl_init("fs", fs_shared_sysctls); + return 0; +} + +early_initcall(init_fs_shared_sysctls); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5bbb4c59dd1a..6d9d2001b790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,24 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, #ifdef CONFIG_FILE_LOCKING { .procname = "leases-enable", From dd81faa88340a1fe8cd81c8ecbadd8e95c58549c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:10 -0800 Subject: [PATCH 0364/1295] fs: move locking sysctls where they are used kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. The locking fs sysctls are only used on fs/locks.c, so move them there. Link: https://lkml.kernel.org/r/20211129205548.605569-7-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/fs.h | 4 ---- kernel/sysctl.c | 20 -------------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 0fca9d680978..8c6df10cd9ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -62,6 +62,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -88,8 +89,37 @@ static int target_leasetype(struct file_lock *fl) return fl->fl_type; } -int leases_enable = 1; -int lease_break_time = 45; +static int leases_enable = 1; +static int lease_break_time = 45; + +#ifdef CONFIG_SYSCTL +static struct ctl_table locks_sysctls[] = { + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_MMU + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_MMU */ + {} +}; + +static int __init init_fs_locks_sysctls(void) +{ + register_sysctl_init("fs", locks_sysctls); + return 0; +} +early_initcall(init_fs_locks_sysctls); +#endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b856d5da9e2..0e08c3dd8f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,10 +82,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; -extern int sysctl_protected_symlinks; -extern int sysctl_protected_hardlinks; -extern int sysctl_protected_fifos; -extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6d9d2001b790..073948e9d165 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,26 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#endif { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, From 9c011be132972ff94bde2ae99064e29f94e85c68 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:13 -0800 Subject: [PATCH 0365/1295] fs: move namei sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move namei's own sysctl knobs to its own file. Other than the move we also avoid initializing two static variables to 0 as this is not needed: * sysctl_protected_symlinks * sysctl_protected_hardlinks Link: https://lkml.kernel.org/r/20211129205548.605569-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 58 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/fs.h | 1 - kernel/sysctl.c | 36 ---------------------------- 3 files changed, 54 insertions(+), 41 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d81f04f8d818..b867a92c078e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1020,10 +1020,60 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -int sysctl_protected_symlinks __read_mostly = 0; -int sysctl_protected_hardlinks __read_mostly = 0; -int sysctl_protected_fifos __read_mostly; -int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly; +static int sysctl_protected_hardlinks __read_mostly; +static int sysctl_protected_fifos __read_mostly; +static int sysctl_protected_regular __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table namei_sysctls[] = { + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +static int __init init_fs_namei_sysctls(void) +{ + register_sysctl_init("fs", namei_sysctls); + return 0; +} +fs_initcall(init_fs_namei_sysctls); + +#endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e08c3dd8f75..9617dea24978 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,7 +81,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern int leases_enable, lease_break_time; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 073948e9d165..53fb4692facc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,42 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "suid_dumpable", .data = &suid_dumpable, From 66ad398634c21e0a42ce10002ae06c39352da0d1 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:17 -0800 Subject: [PATCH 0366/1295] fs: move fs/exec.c sysctls into its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the fs/exec.c respective sysctls to its own file. Since checkpatch complains about style issues with the old code, this move also fixes a few of those minor style issues: * Use pr_warn() instead of prink(WARNING * New empty lines are wanted at the beginning of routines Link: https://lkml.kernel.org/r/20211129205548.605569-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 66 ------------------------------------ 2 files changed, 90 insertions(+), 66 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 3c3c366a9bcf..310750340e4a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -2099,3 +2100,92 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, argv, envp, flags); } #endif + +#ifdef CONFIG_SYSCTL + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +#endif +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table fs_exec_sysctls[] = { + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +#ifdef CONFIG_COREDUMP + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table kernel_exec_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif + +static int __init init_fs_exec_sysctls(void) +{ + register_sysctl_init("fs", fs_exec_sysctls); +#ifdef CONFIG_COREDUMP + register_sysctl_init("kernel", kernel_exec_sysctls); +#endif + return 0; +} + +fs_initcall(init_fs_exec_sysctls); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53fb4692facc..8bab82aa0192 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1117,40 +1117,6 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1874,29 +1840,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -2897,15 +2840,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "pipe-max-size", .data = &pipe_max_size, From 1998f19324d24df7de4e74d81503b4299eb99e7d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:20 -0800 Subject: [PATCH 0367/1295] fs: move pipe sysctls to is own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the pipe sysctls to its own file. Link: https://lkml.kernel.org/r/20211129205548.605569-10-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 64 +++++++++++++++++++++++++++++++++++++-- include/linux/pipe_fs_i.h | 4 --- include/linux/sysctl.h | 6 ++++ kernel/sysctl.c | 61 ++++--------------------------------- 4 files changed, 73 insertions(+), 62 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..cc28623a67b6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -50,13 +51,13 @@ * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_size = 1048576; +static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ -unsigned long pipe_user_pages_hard; -unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; +static unsigned long pipe_user_pages_hard; +static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of @@ -1428,6 +1429,60 @@ static struct file_system_type pipe_fs_type = { .kill_sb = kill_anon_super, }; +#ifdef CONFIG_SYSCTL +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static struct ctl_table fs_pipe_sysctls[] = { + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { } +}; +#endif + static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); @@ -1439,6 +1494,9 @@ static int __init init_pipe_fs(void) unregister_filesystem(&pipe_fs_type); } } +#ifdef CONFIG_SYSCTL + register_sysctl_init("fs", fs_pipe_sysctls); +#endif return err; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index fc5642431b92..c00c618ef290 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -238,10 +238,6 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_size; -extern unsigned long pipe_user_pages_hard; -extern unsigned long pipe_user_pages_soft; - /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb921eb8a02d..4294e9668bd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -221,6 +221,12 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table, extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data); extern int pwrsw_enabled; extern int unaligned_enabled; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bab82aa0192..f79ec2cf8721 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include @@ -761,12 +760,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -static int do_proc_douintvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); @@ -1090,33 +1089,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -2840,27 +2812,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, { .procname = "mount-max", .data = &sysctl_mount_max, From 51cb8dfc5a5c39e6c70376b9dc9a14d624a9d271 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:24 -0800 Subject: [PATCH 0368/1295] sysctl: add and use base directory declarer and registration helper Patch series "sysctl: add and use base directory declarer and registration helper". In this patch series we start addressing base directories, and so we start with the "fs" sysctls. The end goal is we end up completely moving all "fs" sysctl knobs out from kernel/sysctl. This patch (of 6): Add a set of helpers which can be used to declare and register base directory sysctls on their own. We do this so we can later move each of the base sysctl directories like "fs", "kernel", etc, to their own respective files instead of shoving the declarations and registrations all on kernel/sysctl.c. The lazy approach has caught up and with this, we just end up extending the list of base directories / sysctls on one file and this makes maintenance difficult due to merge conflicts from many developers. The declarations are used first by kernel/sysctl.c for registration its own base which over time we'll try to clean up. It will be used in the next patch to demonstrate how to cleanly deal with base sysctl directories. [mcgrof@kernel.org: null-terminate the ctl_table arrays] Link: https://lkml.kernel.org/r/YafJY3rXDYnjK/gs@bombadil.infradead.org Link: https://lkml.kernel.org/r/20211129211943.640266-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129211943.640266-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Christian Brauner Cc: Eric Biggers Cc: "Naveen N. Rao" Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 9 +++++++++ include/linux/sysctl.h | 24 ++++++++++++++++++++++++ kernel/sysctl.c | 41 ++++++++++------------------------------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b9d53b53d769..95dfd195e04e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1646,6 +1646,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +int __register_sysctl_base(struct ctl_table *base_table) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(base_table); + kmemleak_not_leak(hdr); + return 0; +} + static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4294e9668bd5..02134a8abad7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -194,6 +194,20 @@ struct ctl_path { #ifdef CONFIG_SYSCTL +#define DECLARE_SYSCTL_BASE(_name, _table) \ +static struct ctl_table _name##_base_table[] = { \ + { \ + .procname = #_name, \ + .mode = 0555, \ + .child = _table, \ + }, \ + { }, \ +} + +extern int __register_sysctl_base(struct ctl_table *base_table); + +#define register_sysctl_base(_name) __register_sysctl_base(_name##_base_table) + void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, @@ -236,6 +250,16 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; #else /* CONFIG_SYSCTL */ + +#define DECLARE_SYSCTL_BASE(_name, _table) + +static inline int __register_sysctl_base(struct ctl_table *base_table) +{ + return 0; +} + +#define register_sysctl_base(table) __register_sysctl_base(table) + static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { return NULL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f79ec2cf8721..12456a535059 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,41 +2851,20 @@ static struct ctl_table dev_table[] = { { } }; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; +DECLARE_SYSCTL_BASE(kernel, kern_table); +DECLARE_SYSCTL_BASE(vm, vm_table); +DECLARE_SYSCTL_BASE(fs, fs_table); +DECLARE_SYSCTL_BASE(debug, debug_table); +DECLARE_SYSCTL_BASE(dev, dev_table); int __init sysctl_init(void) { - struct ctl_table_header *hdr; + register_sysctl_base(kernel); + register_sysctl_base(vm); + register_sysctl_base(fs); + register_sysctl_base(debug); + register_sysctl_base(dev); - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); return 0; } #endif /* CONFIG_SYSCTL */ From ab171b952c6e065779687b44041038efdadb3915 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:27 -0800 Subject: [PATCH 0369/1295] fs: move namespace sysctls and declare fs base directory This moves the namespace sysctls to its own file as part of the kernel/sysctl.c spring cleaning Since we have now removed all sysctls for "fs", we now have to declare it on the filesystem code, we do that using the new helper, which reduces boiler plate code. We rename init_fs_shared_sysctls() to init_fs_sysctls() to reflect that now fs/sysctls.c is taking on the burden of being the first to register the base directory as well. Lastly, since init code will load in the order in which we link it we have to move the sysctl code to be linked in early, so that its early init routine runs prior to other fs code. This way, other filesystem code can register their own sysctls using the helpers after this: * register_sysctl_init() * register_sysctl() Link: https://lkml.kernel.org/r/20211129211943.640266-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- fs/namespace.c | 24 +++++++++++++++++++++++- fs/sysctls.c | 9 +++++---- include/linux/mount.h | 3 --- kernel/sysctl.c | 14 -------------- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/fs/Makefile b/fs/Makefile index ea8770d124da..dab324aea08f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. # +obj-$(CONFIG_SYSCTL) += sysctls.o + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -28,7 +30,6 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o -obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/namespace.c b/fs/namespace.c index dc31ad6b370f..40b994a29e90 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -37,7 +37,7 @@ #include "internal.h" /* Maximum number of mounts in a mount namespace */ -unsigned int sysctl_mount_max __read_mostly = 100000; +static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; @@ -4620,3 +4620,25 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_SYSCTL +static struct ctl_table fs_namespace_sysctls[] = { + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; + +static int __init init_fs_namespace_sysctls(void) +{ + register_sysctl_init("fs", fs_namespace_sysctls); + return 0; +} +fs_initcall(init_fs_namespace_sysctls); + +#endif /* CONFIG_SYSCTL */ diff --git a/fs/sysctls.c b/fs/sysctls.c index 54216cd1ecd7..c701273c9432 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -29,10 +29,11 @@ static struct ctl_table fs_shared_sysctls[] = { { } }; -static int __init init_fs_shared_sysctls(void) +DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls); + +static int __init init_fs_sysctls(void) { - register_sysctl_init("fs", fs_shared_sysctls); - return 0; + return register_sysctl_base(fs); } -early_initcall(init_fs_shared_sysctls); +early_initcall(init_fs_sysctls); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d92a7e1a742..7f18a7555dff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -113,9 +113,6 @@ extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); - -extern unsigned int sysctl_mount_max; - extern bool path_is_mountpoint(const struct path *path); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 12456a535059..cdc3dc5f0005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2811,18 +2811,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table fs_table[] = { - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - static struct ctl_table debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { @@ -2853,7 +2841,6 @@ static struct ctl_table dev_table[] = { DECLARE_SYSCTL_BASE(kernel, kern_table); DECLARE_SYSCTL_BASE(vm, vm_table); -DECLARE_SYSCTL_BASE(fs, fs_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); @@ -2861,7 +2848,6 @@ int __init sysctl_init(void) { register_sysctl_base(kernel); register_sysctl_base(vm); - register_sysctl_base(fs); register_sysctl_base(debug); register_sysctl_base(dev); From d8c0418aac78e661b5283c9d6a1dfc61d44f26fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:31 -0800 Subject: [PATCH 0370/1295] kernel/sysctl.c: rename sysctl_init() to sysctl_init_bases() Rename sysctl_init() to sysctl_init_bases() so to reflect exactly what this is doing. Link: https://lkml.kernel.org/r/20211129211943.640266-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/alignment.c | 2 +- arch/sh/mm/alignment.c | 2 +- fs/proc/proc_sysctl.c | 4 ++-- include/linux/sysctl.h | 2 +- kernel/sysctl.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index adbb3817d0be..6f499559d193 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -1005,7 +1005,7 @@ static int __init noalign_setup(char *__unused) __setup("noalign", noalign_setup); /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index fb517b82a87b..d802801ceb93 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -161,7 +161,7 @@ static const struct proc_ops alignment_proc_ops = { }; /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 95dfd195e04e..7d9cfc730bd4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL(register_sysctl); * Context: Can only be called after your respective sysctl base path has been * registered. So for instance, most base directories are registered early on * init before init levels are processed through proc_sys_init() and - * sysctl_init(). + * sysctl_init_bases(). */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1768,7 +1768,7 @@ int __init proc_sys_init(void) proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return sysctl_init(); + return sysctl_init_bases(); } struct sysctl_alias { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02134a8abad7..180adf7da785 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -228,7 +228,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); -extern int sysctl_init(void); +extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cdc3dc5f0005..bb07183cd305 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2844,7 +2844,7 @@ DECLARE_SYSCTL_BASE(vm, vm_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); -int __init sysctl_init(void) +int __init sysctl_init_bases(void) { register_sysctl_base(kernel); register_sysctl_base(vm); From fdcd4073fccc6f989308be3f1d61d8a68cd990ce Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:34 -0800 Subject: [PATCH 0371/1295] printk: fix build warning when CONFIG_PRINTK=n build warning when CONFIG_PRINTK=n kernel/printk/printk.c:175:5: warning: no previous prototype for 'devkmsg_sysctl_set_loglvl' [-Wmissing-prototypes] devkmsg_sysctl_set_loglvl() is only used in sysctl.c when CONFIG_PRINTK=y, but it participates in the build when CONFIG_PRINTK=n. So add compile dependency CONFIG_PRINTK=y && CONFIG_SYSCTL=y to fix the build warning. Link: https://lkml.kernel.org/r/20211129211943.640266-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 4 ---- kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 9497f6b98339..1522df223c0f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -183,10 +183,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, - size_t *lenp, loff_t *ppos); - extern void wake_up_klogd(void); char *log_buf_addr_get(void); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6b1c4b399845..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,6 +6,8 @@ #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else #define printk_sysctl_init() do { } while (0) #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 363a493bded8..82abfaf3c2aa 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -171,7 +171,7 @@ static int __init control_devkmsg(char *str) __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; - +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -210,6 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } +#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; From f0bc21b268c1464603192a00851cdbbf7c2cdc36 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:38 -0800 Subject: [PATCH 0372/1295] fs/coredump: move coredump sysctls into its own file This moves the fs/coredump.c respective sysctls to its own file. Link: https://lkml.kernel.org/r/20211129211943.640266-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 66 +++++++++++++++++++++++++++++++++++++--- fs/exec.c | 55 --------------------------------- include/linux/coredump.h | 10 +++--- kernel/sysctl.c | 2 -- 4 files changed, 67 insertions(+), 66 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 7dece20b162b..1c060c0a2d72 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -52,9 +53,9 @@ #include -int core_uses_pid; -unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_uses_pid; +static unsigned int core_pipe_limit; +static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -62,8 +63,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -893,6 +892,63 @@ int dump_align(struct coredump_params *cprm, int align) } EXPORT_SYMBOL(dump_align); +#ifdef CONFIG_SYSCTL + +void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table coredump_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int __init init_fs_coredump_sysctls(void) +{ + register_sysctl_init("kernel", coredump_sysctls); + return 0; +} +fs_initcall(init_fs_coredump_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. diff --git a/fs/exec.c b/fs/exec.c index 310750340e4a..79f2c9483302 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2103,20 +2103,6 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, #ifdef CONFIG_SYSCTL -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - pr_warn( -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2140,50 +2126,9 @@ static struct ctl_table fs_exec_sysctls[] = { { } }; -#ifdef CONFIG_COREDUMP - -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - - if (!error) - validate_coredump_safety(); - return error; -} - -static struct ctl_table kernel_exec_sysctls[] = { - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif - static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); -#ifdef CONFIG_COREDUMP - register_sysctl_init("kernel", kernel_exec_sysctls); -#endif return 0; } diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..248a68c668b4 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,10 +14,6 @@ struct core_vma_metadata { unsigned long dump_size; }; -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; - /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -37,4 +33,10 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +#if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL) +extern void validate_coredump_safety(void); +#else +static inline void validate_coredump_safety(void) {} +#endif + #endif /* _LINUX_COREDUMP_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bb07183cd305..c78585292005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,12 +62,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include From a737a3c6744bc822d1e6a837fef550e665ddf877 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:41 -0800 Subject: [PATCH 0373/1295] kprobe: move sysctl_kprobes_optimization to kprobes.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move sysctl_kprobes_optimization from kernel/sysctl.c to kernel/kprobes.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fix compile issue when CONFIG_OPTPROBES is disabled] Link: https://lkml.kernel.org/r/20211129211943.640266-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 6 ------ kernel/kprobes.c | 30 ++++++++++++++++++++++++++---- kernel/sysctl.c | 12 ------------ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8c8f7a4d93af..19b884353b15 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -348,12 +348,6 @@ extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); DEFINE_INSN_CACHE_OPS(optinsn); -#ifdef CONFIG_SYSCTL -extern int sysctl_kprobes_optimization; -extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void *buffer, - size_t *length, loff_t *ppos); -#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); #else /* !CONFIG_OPTPROBES */ static inline void wait_for_kprobe_optimizer(void) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 21eccc961bba..94cab8c9ce56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,6 +48,9 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) +#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL) +#define kprobe_sysctls_init() do { } while (0) +#endif static int kprobes_initialized; /* kprobe_table can be accessed by @@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void) } static DEFINE_MUTEX(kprobe_sysctl_mutex); -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) +static int sysctl_kprobes_optimization; +static int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void *buffer, + size_t *length, loff_t *ppos) { int ret; @@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table kprobe_sysctls[] = { + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init kprobe_sysctls_init(void) +{ + register_sysctl_init("debug", kprobe_sysctls); +} #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. */ @@ -2584,6 +2605,7 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); + kprobe_sysctls_init(); return err; } early_initcall(init_kprobes); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c78585292005..c322bb629d10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -2818,17 +2817,6 @@ static struct ctl_table debug_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; From e565a8ed1ee4b481539b66cd6f54df9ecf1e9861 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Jan 2022 22:13:45 -0800 Subject: [PATCH 0374/1295] kernel/sysctl.c: remove unused variable ten_thousand The const variable ten_thousand is not used, it is redundant and can be removed. Cleans up clang warning: kernel/sysctl.c:99:18: warning: unused variable 'ten_thousand' [-Wunused-const-variable] static const int ten_thousand = 10000; Link: https://lkml.kernel.org/r/20211221184501.574670-1-colin.i.king@gmail.com Fixes: c26da54dc8ca ("printk: move printk sysctl to printk/sysctl.c") Signed-off-by: Colin Ian King Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c322bb629d10..e92e8d21d6bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,9 +96,6 @@ /* Constants used for minimum and maximum */ -#ifdef CONFIG_PRINTK -static const int ten_thousand = 10000; -#endif #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif From 1622ed7d0743201293094162c26019d2573ecacb Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Jan 2022 22:13:48 -0800 Subject: [PATCH 0375/1295] sysctl: returns -EINVAL when a negative value is passed to proc_doulongvec_minmax When we pass a negative value to the proc_doulongvec_minmax() function, the function returns 0, but the corresponding interface value does not change. we can easily reproduce this problem with the following commands: cd /proc/sys/fs/epoll echo -1 > max_user_watches; echo $?; cat max_user_watches This function requires a non-negative number to be passed in, so when a negative number is passed in, -EINVAL is returned. Link: https://lkml.kernel.org/r/20211220092627.3744624-1-libaokun1@huawei.com Signed-off-by: Baokun Li Reported-by: Hulk Robot Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e92e8d21d6bf..5ae443b2882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1145,10 +1145,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); - if (err) + if (err || neg) { + err = -EINVAL; break; - if (neg) - continue; + } + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; From 67f1c9cd0c561d25354c1e289cdd0d77d3112513 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:51 -0800 Subject: [PATCH 0376/1295] zsmalloc: introduce some helper functions Patch series "zsmalloc: remove bit_spin_lock", v2. zsmalloc uses bit_spin_lock to minimize space overhead since it's zpage granularity lock. However, it causes zsmalloc non-working under PREEMPT_RT as well as adding too much complication. This patchset tries to replace the bit_spin_lock with per-pool rwlock. It also removes unnecessary zspage isolation logic from class, which was the other part too much complication added into zsmalloc. Last patch changes the get_cpu_var to local_lock to make it work in PREEMPT_RT. This patch (of 9): get_zspage_mapping returns fullness as well as class_idx. However, the fullness is usually not used since it could be stale in some contexts. It causes misleading as well as unnecessary instructions so this patch introduces zspage_class. obj_to_location also produces page and index but we don't need always the index, either so this patch introduces obj_to_page. Link: https://lkml.kernel.org/r/20211115185909.3949505-1-minchan@kernel.org Link: https://lkml.kernel.org/r/20211115185909.3949505-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 54 ++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0d3b65939016..dfa58511fa5b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -517,6 +517,12 @@ static void get_zspage_mapping(struct zspage *zspage, *class_idx = zspage->class; } +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) @@ -844,6 +850,12 @@ static void obj_to_location(unsigned long obj, struct page **page, *obj_idx = (obj & OBJ_INDEX_MASK); } +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + /** * location_to_obj - get obj value encoded from (, ) * @page: page object resides in zspage @@ -1246,8 +1258,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; @@ -1270,8 +1280,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, /* migration cannot move any subpage in this zspage */ migrate_read_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = &get_cpu_var(zs_map_area); @@ -1304,16 +1313,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); @@ -1491,8 +1497,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) struct zspage *zspage; struct page *f_page; unsigned long obj; - unsigned int f_objidx; - int class_idx; struct size_class *class; enum fullness_group fullness; bool isolated; @@ -1502,13 +1506,11 @@ void zs_free(struct zs_pool *pool, unsigned long handle) pin_tag(handle); obj = handle_to_obj(handle); - obj_to_location(obj, &f_page, &f_objidx); + obj_to_page(obj, &f_page); zspage = get_zspage(f_page); migrate_read_lock(zspage); - - get_zspage_mapping(zspage, &class_idx, &fullness); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); spin_lock(&class->lock); obj_free(class, obj); @@ -1866,8 +1868,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct address_space *mapping; @@ -1880,15 +1880,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) zspage = get_zspage(page); - /* - * Without class lock, fullness could be stale while class_idx is okay - * because class_idx is constant unless page is freed so we should get - * fullness again under class lock. - */ - get_zspage_mapping(zspage, &class_idx, &fullness); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; + + class = zspage_class(pool, zspage); spin_lock(&class->lock); if (get_zspage_inuse(zspage) == 0) { @@ -1907,6 +1902,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * size_class to prevent further object allocation from the zspage. */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + enum fullness_group fullness; + unsigned int class_idx; + get_zspage_mapping(zspage, &class_idx, &fullness); atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); @@ -1923,8 +1921,6 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; @@ -1949,9 +1945,8 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, /* Concurrent compactor cannot migrate any subpage in zspage */ migrate_write_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); pool = mapping->private_data; - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); offset = get_first_obj_offset(page); spin_lock(&class->lock); @@ -2049,8 +2044,6 @@ static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fg; struct address_space *mapping; struct zspage *zspage; @@ -2058,10 +2051,9 @@ static void zs_page_putback(struct page *page) VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); spin_lock(&class->lock); dec_zspage_isolation(zspage); From 3828a76470792aaa5f2de5c0d7fce497187c1e35 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:54 -0800 Subject: [PATCH 0377/1295] zsmalloc: rename zs_stat_type to class_stat_type The stat aims for class stat, not zspage so rename it. Link: https://lkml.kernel.org/r/20211115185909.3949505-3-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index dfa58511fa5b..e219593cb9de 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -158,7 +158,7 @@ enum fullness_group { NR_ZS_FULLNESS, }; -enum zs_stat_type { +enum class_stat_type { CLASS_EMPTY, CLASS_ALMOST_EMPTY, CLASS_ALMOST_FULL, @@ -549,21 +549,21 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_inc(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_dec(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ +/* type can be of enum type class_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, int type) { @@ -725,7 +725,7 @@ static void insert_zspage(struct size_class *class, { struct zspage *head; - zs_stat_inc(class, fullness, 1); + class_stat_inc(class, fullness, 1); head = list_first_entry_or_null(&class->fullness_list[fullness], struct zspage, list); /* @@ -750,7 +750,7 @@ static void remove_zspage(struct size_class *class, VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); - zs_stat_dec(class, fullness, 1); + class_stat_dec(class, fullness, 1); } /* @@ -964,7 +964,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } @@ -1394,7 +1394,7 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - zs_stat_inc(class, OBJ_USED, 1); + class_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1458,7 +1458,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1489,7 +1489,7 @@ static void obj_free(struct size_class *class, unsigned long obj) kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - zs_stat_dec(class, OBJ_USED, 1); + class_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) From 0a5f079b810765be3bd931fce0f88154035af897 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:57 -0800 Subject: [PATCH 0378/1295] zsmalloc: decouple class actions from zspage works This patch moves class stat update out of obj_malloc since it's not related to zspage operation. This is a preparation to introduce new lock scheme in next patch. Link: https://lkml.kernel.org/r/20211115185909.3949505-4-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e219593cb9de..c45807f170e8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1360,17 +1360,19 @@ size_t zs_huge_class_size(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_huge_class_size); -static unsigned long obj_malloc(struct size_class *class, +static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; + struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; + class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); @@ -1394,7 +1396,6 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - class_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1433,10 +1434,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); + class_stat_inc(class, OBJ_USED, 1); spin_unlock(&class->lock); return handle; @@ -1451,7 +1453,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } spin_lock(&class->lock); - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class->index, newfg); @@ -1459,6 +1461,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_USED, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1468,7 +1471,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct size_class *class, unsigned long obj) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @@ -1478,7 +1481,7 @@ static void obj_free(struct size_class *class, unsigned long obj) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class->size * f_objidx) & ~PAGE_MASK; + f_offset = (class_size * f_objidx) & ~PAGE_MASK; zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); @@ -1489,7 +1492,6 @@ static void obj_free(struct size_class *class, unsigned long obj) kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - class_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) @@ -1513,7 +1515,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) class = zspage_class(pool, zspage); spin_lock(&class->lock); - obj_free(class, obj); + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); if (fullness != ZS_EMPTY) { migrate_read_unlock(zspage); @@ -1671,7 +1674,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; /* @@ -1683,7 +1686,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); - obj_free(class, used_obj); + obj_free(class->size, used_obj); } /* Remember last position in this iteration */ From 3ae92ac23bd88ed18c43a5b138d5a91252c31d2a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:01 -0800 Subject: [PATCH 0379/1295] zsmalloc: introduce obj_allocated The usage pattern for obj_to_head is to check whether the zpage is allocated or not. Thus, introduce obj_allocated. Link: https://lkml.kernel.org/r/20211115185909.3949505-5-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c45807f170e8..5c72fd5f8fac 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -877,13 +877,21 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct page *page, void *obj) +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { + unsigned long handle; + if (unlikely(PageHugeObject(page))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page->index; + handle = page->index; } else - return *(unsigned long *)obj; + handle = *(unsigned long *)obj; + + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; + + *phandle = handle & ~OBJ_ALLOCATED_TAG; + return true; } static inline int testpin_tag(unsigned long handle) @@ -1606,7 +1614,6 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - unsigned long head; int offset = 0; int index = *obj_idx; unsigned long handle = 0; @@ -1616,9 +1623,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(page, addr + offset); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr + offset, &handle)) { if (trypin_tag(handle)) break; handle = 0; @@ -1928,7 +1933,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct page *dummy; void *s_addr, *d_addr, *addr; int offset, pos; - unsigned long handle, head; + unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; int ret = -EAGAIN; @@ -1964,9 +1969,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, pos = offset; s_addr = kmap_atomic(page); while (pos < PAGE_SIZE) { - head = obj_to_head(page, s_addr + pos); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, s_addr + pos, &handle)) { if (!trypin_tag(handle)) goto unpin_objects; } @@ -1982,9 +1985,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, for (addr = s_addr + offset; addr < s_addr + pos; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr, &handle)) { BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); @@ -2029,9 +2030,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, unpin_objects: for (addr = s_addr + offset; addr < s_addr + pos; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr, &handle)) { BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } From a41ec880aa7beeabcb3bfa476ef35b23f376133b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:04 -0800 Subject: [PATCH 0380/1295] zsmalloc: move huge compressed obj from page to zspage The flag aims for zspage, not per page. Let's move it to zspage. Link: https://lkml.kernel.org/r/20211115185909.3949505-6-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5c72fd5f8fac..648306245d51 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -121,6 +121,7 @@ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @@ -213,22 +214,6 @@ struct size_class { struct zs_size_stat stats; }; -/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ -static void SetPageHugeObject(struct page *page) -{ - SetPageOwnerPriv1(page); -} - -static void ClearPageHugeObject(struct page *page) -{ - ClearPageOwnerPriv1(page); -} - -static int PageHugeObject(struct page *page) -{ - return PageOwnerPriv1(page); -} - /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. @@ -278,6 +263,7 @@ struct zs_pool { struct zspage { struct { + unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; @@ -298,6 +284,17 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + #ifdef CONFIG_COMPACTION static int zs_register_migration(struct zs_pool *pool); static void zs_unregister_migration(struct zs_pool *pool); @@ -830,7 +827,9 @@ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - if (unlikely(PageHugeObject(page))) + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) return NULL; return (struct page *)page->index; @@ -880,8 +879,9 @@ static unsigned long handle_to_obj(unsigned long handle) static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { unsigned long handle; + struct zspage *zspage = get_zspage(page); - if (unlikely(PageHugeObject(page))) { + if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); handle = page->index; } else @@ -920,7 +920,6 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); - ClearPageHugeObject(page); page->index = 0; } @@ -1062,7 +1061,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) - SetPageHugeObject(page); + SetZsHugePage(zspage); } else { prev_page->index = (unsigned long)page; } @@ -1307,7 +1306,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (likely(!PageHugeObject(page))) + if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; @@ -1395,7 +1394,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); - if (likely(!PageHugeObject(m_page))) + if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else @@ -1496,7 +1495,10 @@ static void obj_free(int class_size, unsigned long obj) /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); @@ -1867,7 +1869,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); - if (unlikely(PageHugeObject(oldpage))) + if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, page_mapping(oldpage)); } From c4549b871102db01ba23cce4bd0cdac511f8991d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:07 -0800 Subject: [PATCH 0381/1295] zsmalloc: remove zspage isolation for migration zspage isolation for migration introduced additional exceptions to be dealt with since the zspage was isolated from class list. The reason why I isolated zspage from class list was to prevent race between obj_malloc and page migration via allocating zpage from the zspage further. However, it couldn't prevent object freeing from zspage so it needed corner case handling. This patch removes the whole mess. Now, we are fine since class->lock and zspage->lock can prevent the race. Link: https://lkml.kernel.org/r/20211115185909.3949505-7-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 157 +++----------------------------------------------- 1 file changed, 8 insertions(+), 149 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 648306245d51..c150cea15a6c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -254,10 +254,6 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; - /* A wait queue for when migration races with async_free_zspage() */ - struct wait_queue_head migration_wait; - atomic_long_t isolated_pages; - bool destroying; #endif }; @@ -454,11 +450,6 @@ MODULE_ALIAS("zpool-zsmalloc"); /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -static bool is_zspage_isolated(struct zspage *zspage) -{ - return zspage->isolated; -} - static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); @@ -744,7 +735,6 @@ static void remove_zspage(struct size_class *class, enum fullness_group fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); - VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); class_stat_dec(class, fullness, 1); @@ -770,13 +760,9 @@ static enum fullness_group fix_fullness_group(struct size_class *class, if (newfg == currfg) goto out; - if (!is_zspage_isolated(zspage)) { - remove_zspage(class, zspage, currfg); - insert_zspage(class, zspage, newfg); - } - + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); - out: return newfg; } @@ -1511,7 +1497,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) unsigned long obj; struct size_class *class; enum fullness_group fullness; - bool isolated; if (unlikely(!handle)) return; @@ -1533,11 +1518,9 @@ void zs_free(struct zs_pool *pool, unsigned long handle) goto out; } - isolated = is_zspage_isolated(zspage); migrate_read_unlock(zspage); /* If zspage is isolated, zs_page_putback will free the zspage */ - if (likely(!isolated)) - free_zspage(pool, class, zspage); + free_zspage(pool, class, zspage); out: spin_unlock(&class->lock); @@ -1718,7 +1701,6 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], struct zspage, list); if (zspage) { - VM_BUG_ON(is_zspage_isolated(zspage)); remove_zspage(class, zspage, fg[i]); return zspage; } @@ -1739,8 +1721,6 @@ static enum fullness_group putback_zspage(struct size_class *class, { enum fullness_group fullness; - VM_BUG_ON(is_zspage_isolated(zspage)); - fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); set_zspage_mapping(zspage, class->index, fullness); @@ -1822,35 +1802,10 @@ static void inc_zspage_isolation(struct zspage *zspage) static void dec_zspage_isolation(struct zspage *zspage) { + VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } -static void putback_zspage_deferred(struct zs_pool *pool, - struct size_class *class, - struct zspage *zspage) -{ - enum fullness_group fg; - - fg = putback_zspage(class, zspage); - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); - -} - -static inline void zs_pool_dec_isolated(struct zs_pool *pool) -{ - VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); - /* - * Checking pool->destroying must happen after atomic_long_dec() - * for pool->isolated_pages above. Paired with the smp_mb() in - * zs_unregister_migration(). - */ - smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) - wake_up_all(&pool->migration_wait); -} - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1876,10 +1831,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct size_class *class; struct zspage *zspage; - struct address_space *mapping; /* * Page is locked so zspage couldn't be destroyed. For detail, look at @@ -1889,39 +1841,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); - - mapping = page_mapping(page); - pool = mapping->private_data; - - class = zspage_class(pool, zspage); - - spin_lock(&class->lock); - if (get_zspage_inuse(zspage) == 0) { - spin_unlock(&class->lock); - return false; - } - - /* zspage is isolated for object migration */ - if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - spin_unlock(&class->lock); - return false; - } - - /* - * If this is first time isolation for the zspage, isolate zspage from - * size_class to prevent further object allocation from the zspage. - */ - if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - enum fullness_group fullness; - unsigned int class_idx; - - get_zspage_mapping(zspage, &class_idx, &fullness); - atomic_long_inc(&pool->isolated_pages); - remove_zspage(class, zspage, fullness); - } - + migrate_write_lock(zspage); inc_zspage_isolation(zspage); - spin_unlock(&class->lock); + migrate_write_unlock(zspage); return true; } @@ -2004,21 +1926,6 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, dec_zspage_isolation(zspage); - /* - * Page migration is done so let's putback isolated zspage to - * the list if @page is final isolated subpage in the zspage. - */ - if (!is_zspage_isolated(zspage)) { - /* - * We cannot race with zs_destroy_pool() here because we wait - * for isolation to hit zero before we start destroying. - * Also, we ensure that everyone can see pool->destroying before - * we start waiting. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -2046,30 +1953,15 @@ unpin_objects: static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct size_class *class; - struct address_space *mapping; struct zspage *zspage; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - mapping = page_mapping(page); - pool = mapping->private_data; - class = zspage_class(pool, zspage); - - spin_lock(&class->lock); + migrate_write_lock(zspage); dec_zspage_isolation(zspage); - if (!is_zspage_isolated(zspage)) { - /* - * Due to page_lock, we cannot free zspage immediately - * so let's defer. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - spin_unlock(&class->lock); + migrate_write_unlock(zspage); } static const struct address_space_operations zsmalloc_aops = { @@ -2091,36 +1983,8 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } -static bool pool_isolated_are_drained(struct zs_pool *pool) -{ - return atomic_long_read(&pool->isolated_pages) == 0; -} - -/* Function for resolving migration */ -static void wait_for_isolated_drain(struct zs_pool *pool) -{ - - /* - * We're in the process of destroying the pool, so there are no - * active allocations. zs_page_isolate() fails for completely free - * zspages, so we need only wait for the zs_pool's isolated - * count to hit zero. - */ - wait_event(pool->migration_wait, - pool_isolated_are_drained(pool)); -} - static void zs_unregister_migration(struct zs_pool *pool) { - pool->destroying = true; - /* - * We need a memory barrier here to ensure global visibility of - * pool->destroying. Thus pool->isolated pages will either be 0 in which - * case we don't care, or it will be > 0 and pool->destroying will - * ensure that we wake up once isolation hits 0. - */ - smp_mb(); - wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2150,7 +2014,6 @@ static void async_free_zspage(struct work_struct *work) spin_unlock(&class->lock); } - list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); @@ -2363,10 +2226,6 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pool->migration_wait); -#endif - if (create_cache(pool)) goto err; From 4a57d6bbaecd28c8175dc5da013009e4158018c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:10 -0800 Subject: [PATCH 0382/1295] locking/rwlocks: introduce write_lock_nested In preparation for converting bit_spin_lock to rwlock in zsmalloc so that multiple writers of zspages can run at the same time but those zspages are supposed to be different zspage instance. Thus, it's not deadlock. This patch adds write_lock_nested to support the case for LOCKDEP. [minchan@kernel.org: fix write_lock_nested for RT] Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com [bigeasy@linutronix.de: fixup write_lock_nested() implementation] Link: https://lkml.kernel.org/r/20211123170134.y6xb7pmpgdn4m3bn@linutronix.de Link: https://lkml.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rwlock.h | 6 ++++++ include/linux/rwlock_api_smp.h | 8 ++++++++ include/linux/rwlock_rt.h | 10 ++++++++++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 10 ++++++++++ kernel/locking/spinlock_rt.c | 12 ++++++++++++ 6 files changed, 47 insertions(+) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 2c0ad417ce3c..8f416c5e929e 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -55,6 +55,12 @@ do { \ #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) +#else +#define write_lock_nested(lock, subclass) _raw_write_lock(lock) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define read_lock_irqsave(lock, flags) \ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index f1db6f17c4fb..dceb0a59b692 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -17,6 +17,7 @@ void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); @@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } +static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); +} + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 49c1f3842ed5..8544ff05e594 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock); extern int rt_read_trylock(rwlock_t *rwlock); extern void rt_read_unlock(rwlock_t *rwlock); extern void rt_write_lock(rwlock_t *rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); extern int rt_write_trylock(rwlock_t *rwlock); extern void rt_write_unlock(rwlock_t *rwlock); @@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock) rt_write_lock(rwlock); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rt_write_lock_nested(rwlock, subclass); +} +#else +#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) +#endif + static __always_inline void write_lock_bh(rwlock_t *rwlock) { local_bh_disable(); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..b8ba00ccccde 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -59,6 +59,7 @@ #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) +#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) #define _raw_read_lock_bh(lock) __LOCK_BH(lock) #define _raw_write_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b562f9289372..7f49baaa4979 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 9e396a09fe0f..48a19ed8486d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); From b475d42d2c43321d8bea685f54916220cb76b511 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:13 -0800 Subject: [PATCH 0383/1295] zsmalloc: replace per zpage lock with pool->migrate_lock The zsmalloc has used a bit for spin_lock in zpage handle to keep zpage object alive during several operations. However, it causes the problem for PREEMPT_RT as well as introducing too complicated. This patch replaces the bit spin_lock with pool->migrate_lock rwlock. It could make the code simple as well as zsmalloc work under PREEMPT_RT. The drawback is the pool->migrate_lock is bigger granuarity than per zpage lock so the contention would be higher than old when both IO-related operations(i.e., zsmalloc, zsfree, zs_[map|unmap]) and compaction(page/zpage migration) are going in parallel(*, the migrate_lock is rwlock and IO related functions are all read side lock so there is no contention). However, the write-side is fast enough(dominant overhead is just page copy) so it wouldn't affect much. If the lock granurity becomes more problem later, we could introduce table locks based on handle as a hash value. Link: https://lkml.kernel.org/r/20211115185909.3949505-9-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 205 +++++++++++++++++++++++--------------------------- 1 file changed, 96 insertions(+), 109 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c150cea15a6c..1b26fa6b9fa8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * lock ordering: + * page_lock + * pool->migrate_lock + * class->lock + * zspage->lock + */ + #include #include #include @@ -100,15 +108,6 @@ #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -/* - * Memory for allocating for handle keeps object position by - * encoding and the encoded value has a room - * in least bit(ie, look at obj_to_location). - * We use the bit to synchronize between object access by - * user and migration. - */ -#define HANDLE_PIN_BIT 0 - /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. @@ -255,6 +254,8 @@ struct zs_pool { struct inode *inode; struct work_struct free_work; #endif + /* protect page/zspage migration */ + rwlock_t migrate_lock; }; struct zspage { @@ -297,6 +298,9 @@ static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); @@ -308,6 +312,9 @@ static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -359,14 +366,10 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +/* class->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. - */ - WRITE_ONCE(*(unsigned long *)handle, obj); + *(unsigned long *)handle = obj; } /* zpool driver */ @@ -880,26 +883,6 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) return true; } -static inline int testpin_tag(unsigned long handle) -{ - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static inline int trypin_tag(unsigned long handle) -{ - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void pin_tag(unsigned long handle) __acquires(bitlock) -{ - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void unpin_tag(unsigned long handle) __releases(bitlock) -{ - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -968,6 +951,11 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. The page locks trylock_zspage got will be released + * by __free_zspage. + */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; @@ -1263,15 +1251,20 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - /* From now on, migration cannot move the object */ - pin_tag(handle); - + /* It guarantees it can get zspage from handle safely */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - /* migration cannot move any subpage in this zspage */ + /* + * migration cannot move any zpages in this zspage. Here, class->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. + */ migrate_read_lock(zspage); + read_unlock(&pool->migrate_lock); class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; @@ -1330,7 +1323,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) put_cpu_var(zs_map_area); migrate_read_unlock(zspage); - unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1424,6 +1416,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; + /* class->lock effectively protects the zpage migration */ spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { @@ -1501,30 +1494,27 @@ void zs_free(struct zs_pool *pool, unsigned long handle) if (unlikely(!handle)) return; - pin_tag(handle); + /* + * The pool->migrate_lock protects the race with zpage's migration + * so it's safe to get the page from handle. + */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); - - migrate_read_lock(zspage); class = zspage_class(pool, zspage); - spin_lock(&class->lock); + read_unlock(&pool->migrate_lock); + obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) { - migrate_read_unlock(zspage); + if (fullness != ZS_EMPTY) goto out; - } - migrate_read_unlock(zspage); - /* If zspage is isolated, zs_page_putback will free the zspage */ free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); - unpin_tag(handle); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1608,11 +1598,8 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) { - if (trypin_tag(handle)) - break; - handle = 0; - } + if (obj_allocated(page, addr + offset, &handle)) + break; offset += class->size; index++; @@ -1658,7 +1645,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Stop if there is no more space */ if (zspage_full(class, get_zspage(d_page))) { - unpin_tag(handle); ret = -ENOMEM; break; } @@ -1667,15 +1653,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; - /* - * record_obj updates handle's value to free_obj and it will - * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which - * breaks synchronization using pin_tag(e,g, zs_free) so - * let's keep the lock bit. - */ - free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); - unpin_tag(handle); obj_free(class->size, used_obj); } @@ -1789,6 +1767,11 @@ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -1856,11 +1839,10 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset, pos; + int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - int ret = -EAGAIN; /* * We cannot support the _NO_COPY case here, because copy needs to @@ -1873,32 +1855,25 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - zspage = get_zspage(page); - - /* Concurrent compactor cannot migrate any subpage in zspage */ - migrate_write_lock(zspage); pool = mapping->private_data; + + /* + * The pool migrate_lock protects the race between zpage migration + * and zs_free. + */ + write_lock(&pool->migrate_lock); + zspage = get_zspage(page); class = zspage_class(pool, zspage); - offset = get_first_obj_offset(page); + /* + * the class lock protects zpage alloc/free in the zspage. + */ spin_lock(&class->lock); - if (!get_zspage_inuse(zspage)) { - /* - * Set "offset" to end of the page so that every loops - * skips unnecessary object scanning. - */ - offset = PAGE_SIZE; - } + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); - pos = offset; + offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); - while (pos < PAGE_SIZE) { - if (obj_allocated(page, s_addr + pos, &handle)) { - if (!trypin_tag(handle)) - goto unpin_objects; - } - pos += class->size; - } /* * Here, any user cannot access all objects in the zspage so let's move. @@ -1907,25 +1882,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, memcpy(d_addr, s_addr, PAGE_SIZE); kunmap_atomic(d_addr); - for (addr = s_addr + offset; addr < s_addr + pos; + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { if (obj_allocated(page, addr, &handle)) { - BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); - new_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, new_obj); } } + kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - get_page(newpage); - + /* + * Since we complete the data copy and set up new zspage structure, + * it's okay to release migration_lock. + */ + write_unlock(&pool->migrate_lock); + spin_unlock(&class->lock); dec_zspage_isolation(zspage); + migrate_write_unlock(zspage); + get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -1933,22 +1913,8 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, reset_page(page); put_page(page); - page = newpage; - ret = MIGRATEPAGE_SUCCESS; -unpin_objects: - for (addr = s_addr + offset; addr < s_addr + pos; - addr += class->size) { - if (obj_allocated(page, addr, &handle)) { - BUG_ON(!testpin_tag(handle)); - unpin_tag(handle); - } - } - kunmap_atomic(s_addr); - spin_unlock(&class->lock); - migrate_write_unlock(zspage); - - return ret; + return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) @@ -2077,8 +2043,13 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; + /* protect the race between zpage migration and zs_free */ + write_lock(&pool->migrate_lock); + /* protect zpage allocation/free */ spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { + /* protect someone accessing the zspage(i.e., zs_map_object) */ + migrate_write_lock(src_zspage); if (!zs_can_compact(class)) break; @@ -2087,6 +2058,8 @@ static unsigned long __zs_compact(struct zs_pool *pool, cc.s_page = get_first_page(src_zspage); while ((dst_zspage = isolate_zspage(class, false))) { + migrate_write_lock_nested(dst_zspage); + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched @@ -2096,6 +2069,10 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + if (rwlock_is_contended(&pool->migrate_lock)) + break; } /* Stop if we couldn't find slot */ @@ -2103,19 +2080,28 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_unlock(src_zspage); free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; - } + } else + migrate_write_unlock(src_zspage); spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); cond_resched(); + write_lock(&pool->migrate_lock); spin_lock(&class->lock); } - if (src_zspage) + if (src_zspage) { putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); return pages_freed; } @@ -2221,6 +2207,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); + rwlock_init(&pool->migrate_lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) From a37265995c867a4e413761d846cef0445b08d6d5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 21 Jan 2022 22:14:17 -0800 Subject: [PATCH 0384/1295] zsmalloc: replace get_cpu_var with local_lock The usage of get_cpu_var() in zs_map_object() is problematic because it disables preemption and makes it impossible to acquire any sleeping lock on PREEMPT_RT such as a spinlock_t. Replace the get_cpu_var() usage with a local_lock_t which is embedded struct mapping_area. It ensures that the access the struct is synchronized against all users on the same CPU. [minchan: remove the bit_spin_lock part and change the title] Link: https://lkml.kernel.org/r/20211115185909.3949505-10-minchan@kernel.org Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Minchan Kim Tested-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1b26fa6b9fa8..9152fbde33b5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -65,6 +65,7 @@ #include #include #include +#include #define ZSPAGE_MAGIC 0x58 @@ -276,6 +277,7 @@ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ @@ -451,7 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static __maybe_unused int is_first_page(struct page *page) { @@ -1269,7 +1273,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1320,7 +1325,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); } From 6dfbbae14a7b961f41d80a106e1ab60e86d061c5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:20 -0800 Subject: [PATCH 0385/1295] fs: proc: store PDE()->data into inode->i_private PDE_DATA(inode) is introduced to get user private data and hide the layout of struct proc_dir_entry. The inode->i_private is used to do the same thing as well. Save a copy of user private data to inode-> i_private when proc inode is allocated. This means the user also can get their private data by inode->i_private. Introduce pde_data() to wrap inode->i_private so that we can remove PDE_DATA() from fs/proc/generic.c and make PTE_DATE() as a wrapper of pde_data(). It will be easier if we decide to remove PDE_DATE() in the future. Link: https://lkml.kernel.org/r/20211124081956.87711-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 6 ------ fs/proc/inode.c | 1 + fs/proc/internal.h | 5 ----- include/linux/proc_fs.h | 13 ++++++++++++- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5b78739e60e4..f2132407e133 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -791,12 +791,6 @@ void proc_remove(struct proc_dir_entry *de) } EXPORT_SYMBOL(proc_remove); -void *PDE_DATA(const struct inode *inode) -{ - return __PDE_DATA(inode); -} -EXPORT_SYMBOL(PDE_DATA); - /* * Pull a user buffer into memory and pass it to the file's write handler if * one is supplied. The ->write() method is permitted to modify the diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 599eb724ff2d..f84355c5a36d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -650,6 +650,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return NULL; } + inode->i_private = de->data; inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); PROC_I(inode)->pde = de; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 03415f3fb3a8..06a80f78433d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -115,11 +115,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } -static inline void *__PDE_DATA(const struct inode *inode) -{ - return PDE(inode)->data; -} - static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 01b9268451a8..b6e7005cc1b2 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -110,7 +110,18 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); -extern void *PDE_DATA(const struct inode *); + +/* + * Obtain the private data passed by user through proc_create_data() or + * related. + */ +static inline void *pde_data(const struct inode *inode) +{ + return inode->i_private; +} + +#define PDE_DATA(i) pde_data(i) + extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); From 359745d78351c6f5442435f81549f0207ece28aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:23 -0800 Subject: [PATCH 0386/1295] proc: remove PDE_DATA() completely Remove PDE_DATA() completely and replace it with pde_data(). [akpm@linux-foundation.org: fix naming clash in drivers/nubus/proc.c] [akpm@linux-foundation.org: now fix it properly] Link: https://lkml.kernel.org/r/20211124081956.87711-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/srm_env.c | 4 +-- arch/arm/kernel/atags_proc.c | 2 +- arch/ia64/kernel/salinfo.c | 10 +++--- arch/powerpc/kernel/proc_powerpc.c | 4 +-- arch/sh/mm/alignment.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 4 +-- drivers/acpi/proc.c | 2 +- drivers/hwmon/dell-smm-hwmon.c | 4 +-- drivers/net/bonding/bond_procfs.c | 8 ++--- drivers/net/wireless/cisco/airo.c | 22 ++++++------ .../net/wireless/intersil/hostap/hostap_ap.c | 16 ++++----- .../intersil/hostap/hostap_download.c | 2 +- .../wireless/intersil/hostap/hostap_proc.c | 24 ++++++------- drivers/net/wireless/ray_cs.c | 2 +- drivers/nubus/proc.c | 36 +++++++++---------- drivers/parisc/led.c | 4 +-- drivers/pci/proc.c | 10 +++--- drivers/platform/x86/thinkpad_acpi.c | 4 +-- drivers/platform/x86/toshiba_acpi.c | 16 ++++----- drivers/pnp/isapnp/proc.c | 2 +- drivers/pnp/pnpbios/proc.c | 4 +-- drivers/scsi/scsi_proc.c | 4 +-- drivers/usb/gadget/function/rndis.c | 4 +-- drivers/zorro/proc.c | 2 +- fs/afs/proc.c | 6 ++-- fs/ext4/mballoc.c | 14 ++++---- fs/jbd2/journal.c | 2 +- fs/proc/proc_net.c | 8 ++--- include/linux/proc_fs.h | 4 +-- include/linux/seq_file.h | 2 +- ipc/util.c | 2 +- kernel/irq/proc.c | 8 ++--- kernel/resource.c | 4 +-- net/atm/proc.c | 4 +-- net/bluetooth/af_bluetooth.c | 8 ++--- net/can/bcm.c | 2 +- net/can/proc.c | 2 +- net/core/neighbour.c | 6 ++-- net/core/pktgen.c | 6 ++-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 ++-- net/ipv4/raw.c | 8 ++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 6 ++-- net/netfilter/x_tables.c | 10 +++--- net/netfilter/xt_hashlimit.c | 18 +++++----- net/netfilter/xt_recent.c | 4 +-- net/sunrpc/auth_gss/svcauth_gss.c | 4 +-- net/sunrpc/cache.c | 24 ++++++------- net/sunrpc/stats.c | 2 +- sound/core/info.c | 4 +-- 50 files changed, 178 insertions(+), 180 deletions(-) diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index 528d2be58182..217b4dca51dd 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -83,14 +83,14 @@ static int srm_env_proc_show(struct seq_file *m, void *v) static int srm_env_proc_open(struct inode *inode, struct file *file) { - return single_open(file, srm_env_proc_show, PDE_DATA(inode)); + return single_open(file, srm_env_proc_show, pde_data(inode)); } static ssize_t srm_env_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { int res; - unsigned long id = (unsigned long)PDE_DATA(file_inode(file)); + unsigned long id = (unsigned long)pde_data(file_inode(file)); char *buf = (char *) __get_free_page(GFP_USER); unsigned long ret1, ret2; diff --git a/arch/arm/kernel/atags_proc.c b/arch/arm/kernel/atags_proc.c index 3c2faf2bd124..3ec2afe78423 100644 --- a/arch/arm/kernel/atags_proc.c +++ b/arch/arm/kernel/atags_proc.c @@ -13,7 +13,7 @@ struct buffer { static ssize_t atags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct buffer *b = PDE_DATA(file_inode(file)); + struct buffer *b = pde_data(file_inode(file)); return simple_read_from_buffer(buf, count, ppos, b->data, b->size); } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index a25ab9b37953..bd3ba276e69c 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -282,7 +282,7 @@ salinfo_event_open(struct inode *inode, struct file *file) static ssize_t salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; int i, n, cpu = -1; @@ -340,7 +340,7 @@ static const struct proc_ops salinfo_event_proc_ops = { static int salinfo_log_open(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -365,7 +365,7 @@ salinfo_log_open(struct inode *inode, struct file *file) static int salinfo_log_release(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (data->state == STATE_NO_DATA) { vfree(data->log_buffer); @@ -433,7 +433,7 @@ retry: static ssize_t salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); u8 *buf; u64 bufsize; @@ -494,7 +494,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) static ssize_t salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; u32 offset; diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index 877817471e3c..6a029f2378e1 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -25,7 +25,7 @@ static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes loff_t *ppos) { return simple_read_from_buffer(buf, nbytes, ppos, - PDE_DATA(file_inode(file)), PAGE_SIZE); + pde_data(file_inode(file)), PAGE_SIZE); } static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) @@ -34,7 +34,7 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return -EINVAL; remap_pfn_range(vma, vma->vm_start, - __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + __pa(pde_data(file_inode(file))) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); return 0; } diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index d802801ceb93..3a76a766f423 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -140,7 +140,7 @@ static int alignment_proc_open(struct inode *inode, struct file *file) static ssize_t alignment_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - int *data = PDE_DATA(file_inode(file)); + int *data = pde_data(file_inode(file)); char mode; if (count > 0) { diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 07b642c1916a..8eb6ad1a3a1d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -208,7 +208,7 @@ static int simdisk_detach(struct simdisk *dev) static ssize_t proc_read_simdisk(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); const char *s = dev->filename; if (s) { ssize_t n = simple_read_from_buffer(buf, size, ppos, @@ -225,7 +225,7 @@ static ssize_t proc_write_simdisk(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *tmp = memdup_user_nul(buf, count); - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); int err; if (IS_ERR(tmp)) diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 0cca7991f186..4322f2da6d10 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -127,7 +127,7 @@ static int acpi_system_wakeup_device_open_fs(struct inode *inode, struct file *file) { return single_open(file, acpi_system_wakeup_device_seq_show, - PDE_DATA(inode)); + pde_data(inode)); } static const struct proc_ops acpi_system_wakeup_device_proc_ops = { diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index d401f9acf450..9949eeb79378 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -451,7 +451,7 @@ static int i8k_get_power_status(void) static long i8k_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { - struct dell_smm_data *data = PDE_DATA(file_inode(fp)); + struct dell_smm_data *data = pde_data(file_inode(fp)); int __user *argp = (int __user *)arg; int speed, err; int val = 0; @@ -585,7 +585,7 @@ static int i8k_proc_show(struct seq_file *seq, void *offset) static int i8k_open_fs(struct inode *inode, struct file *file) { - return single_open(file, i8k_proc_show, PDE_DATA(inode)); + return single_open(file, i8k_proc_show, pde_data(inode)); } static const struct proc_ops i8k_proc_ops = { diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 2ec11af5f0cc..46b150e6289e 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -11,7 +11,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; @@ -30,7 +30,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; @@ -57,7 +57,7 @@ static void bond_info_seq_stop(struct seq_file *seq, void *v) static void bond_info_show_master(struct seq_file *seq) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; @@ -175,7 +175,7 @@ static void bond_info_show_master(struct seq_file *seq) static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 45594f003ef7..452d08545d31 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -4672,7 +4672,7 @@ static ssize_t proc_write(struct file *file, static int proc_status_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; CapabilityRid cap_rid; StatusRid status_rid; @@ -4756,7 +4756,7 @@ static int proc_stats_rid_open(struct inode *inode, u16 rid) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; StatsRid stats; int i, j; @@ -4819,7 +4819,7 @@ static inline int sniffing_mode(struct airo_info *ai) static void proc_config_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *line; @@ -5030,7 +5030,7 @@ static const char *get_rmode(__le16 mode) static int proc_config_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; __le16 mode; @@ -5120,7 +5120,7 @@ static int proc_config_open(struct inode *inode, struct file *file) static void proc_SSID_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; SsidRid SSID_rid; int i; @@ -5156,7 +5156,7 @@ static void proc_SSID_on_close(struct inode *inode, struct file *file) static void proc_APList_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; APListRid *APList_rid = &ai->APList; int i; @@ -5280,7 +5280,7 @@ static int set_wep_tx_idx(struct airo_info *ai, u16 index, int perm, int lock) static void proc_wepkey_on_close(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i, rc; char key[16]; @@ -5331,7 +5331,7 @@ static void proc_wepkey_on_close(struct inode *inode, struct file *file) static int proc_wepkey_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; WepKeyRid wkr; @@ -5379,7 +5379,7 @@ static int proc_wepkey_open(struct inode *inode, struct file *file) static int proc_SSID_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5423,7 +5423,7 @@ static int proc_SSID_open(struct inode *inode, struct file *file) static int proc_APList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5462,7 +5462,7 @@ static int proc_APList_open(struct inode *inode, struct file *file) static int proc_BSSList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; BSSListRid BSSList_rid; diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c index 8bcc1cdcb75b..462ccc7d7d1a 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ap.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c @@ -69,7 +69,7 @@ static void prism2_send_mgmt(struct net_device *dev, #if !defined(PRISM2_NO_PROCFS_DEBUG) && defined(CONFIG_PROC_FS) static int ap_debug_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); seq_printf(m, "BridgedUnicastFrames=%u\n", ap->bridged_unicast); seq_printf(m, "BridgedMulticastFrames=%u\n", ap->bridged_multicast); @@ -320,7 +320,7 @@ void hostap_deauth_all_stas(struct net_device *dev, struct ap_data *ap, static int ap_control_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); char *policy_txt; struct mac_entry *entry; @@ -352,20 +352,20 @@ static int ap_control_proc_show(struct seq_file *m, void *v) static void *ap_control_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->mac_restrictions.lock); return seq_list_start_head(&ap->mac_restrictions.mac_list, *_pos); } static void *ap_control_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->mac_restrictions.mac_list, _pos); } static void ap_control_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->mac_restrictions.lock); } @@ -554,20 +554,20 @@ static int prism2_ap_proc_show(struct seq_file *m, void *v) static void *prism2_ap_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->sta_table_lock); return seq_list_start_head(&ap->sta_list, *_pos); } static void *prism2_ap_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->sta_list, _pos); } static void prism2_ap_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->sta_table_lock); } diff --git a/drivers/net/wireless/intersil/hostap/hostap_download.c b/drivers/net/wireless/intersil/hostap/hostap_download.c index 7c6a5a6d1d45..3672291ced5c 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_download.c +++ b/drivers/net/wireless/intersil/hostap/hostap_download.c @@ -227,7 +227,7 @@ static int prism2_download_aux_dump_proc_open(struct inode *inode, struct file * sizeof(struct prism2_download_aux_dump)); if (ret == 0) { struct seq_file *m = file->private_data; - m->private = PDE_DATA(inode); + m->private = pde_data(inode); } return ret; } diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index 51c847d98755..61f68786056f 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -97,20 +97,20 @@ static int prism2_wds_proc_show(struct seq_file *m, void *v) static void *prism2_wds_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_lock_bh(&local->iface_lock); return seq_list_start(&local->hostap_interfaces, *_pos); } static void *prism2_wds_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->hostap_interfaces, _pos); } static void prism2_wds_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_unlock_bh(&local->iface_lock); } @@ -123,7 +123,7 @@ static const struct seq_operations prism2_wds_proc_seqops = { static int prism2_bss_list_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); struct list_head *ptr = v; struct hostap_bss_info *bss; @@ -151,21 +151,21 @@ static int prism2_bss_list_proc_show(struct seq_file *m, void *v) static void *prism2_bss_list_proc_start(struct seq_file *m, loff_t *_pos) __acquires(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); return seq_list_start_head(&local->bss_list, *_pos); } static void *prism2_bss_list_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->bss_list, _pos); } static void prism2_bss_list_proc_stop(struct seq_file *m, void *v) __releases(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } @@ -198,7 +198,7 @@ static int prism2_crypt_proc_show(struct seq_file *m, void *v) static ssize_t prism2_pda_proc_read(struct file *file, char __user *buf, size_t count, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(file)); + local_info_t *local = pde_data(file_inode(file)); size_t off; if (local->pda == NULL || *_pos >= PRISM2_PDA_SIZE) @@ -272,7 +272,7 @@ static int prism2_io_debug_proc_read(char *page, char **start, off_t off, #ifndef PRISM2_NO_STATION_MODES static int prism2_scan_results_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); unsigned long entry; int i, len; struct hfa384x_hostscan_result *scanres; @@ -322,7 +322,7 @@ static int prism2_scan_results_proc_show(struct seq_file *m, void *v) static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); /* We have a header (pos 0) + N results to show (pos 1...N) */ @@ -333,7 +333,7 @@ static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); ++*_pos; if (*_pos > local->last_scan_results_count) @@ -343,7 +343,7 @@ static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t * static void prism2_scan_results_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index e3a3dc3e45b4..2987ad9271f6 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -2746,7 +2746,7 @@ static ssize_t int_proc_write(struct file *file, const char __user *buffer, nr = nr * 10 + c; p++; } while (--len); - *(int *)PDE_DATA(file_inode(file)) = nr; + *(int *)pde_data(file_inode(file)) = nr; return count; } diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 88e1f9a0faaf..1fd667852271 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -93,30 +93,30 @@ struct nubus_proc_pde_data { static struct nubus_proc_pde_data * nubus_proc_alloc_pde_data(unsigned char *ptr, unsigned int size) { - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = kmalloc(sizeof(*pde_data), GFP_KERNEL); - if (!pde_data) + pded = kmalloc(sizeof(*pded), GFP_KERNEL); + if (!pded) return NULL; - pde_data->res_ptr = ptr; - pde_data->res_size = size; - return pde_data; + pded->res_ptr = ptr; + pded->res_size = size; + return pded; } static int nubus_proc_rsrc_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = PDE_DATA(inode); - if (!pde_data) + pded = pde_data(inode); + if (!pded) return 0; - if (pde_data->res_size > m->size) + if (pded->res_size > m->size) return -EFBIG; - if (pde_data->res_size) { + if (pded->res_size) { int lanes = (int)proc_get_parent_data(inode); struct nubus_dirent ent; @@ -124,11 +124,11 @@ static int nubus_proc_rsrc_show(struct seq_file *m, void *v) return 0; ent.mask = lanes; - ent.base = pde_data->res_ptr; + ent.base = pded->res_ptr; ent.data = 0; - nubus_seq_write_rsrc_mem(m, &ent, pde_data->res_size); + nubus_seq_write_rsrc_mem(m, &ent, pded->res_size); } else { - unsigned int data = (unsigned int)pde_data->res_ptr; + unsigned int data = (unsigned int)pded->res_ptr; seq_putc(m, data >> 16); seq_putc(m, data >> 8); @@ -142,18 +142,18 @@ void nubus_proc_add_rsrc_mem(struct proc_dir_entry *procdir, unsigned int size) { char name[9]; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; if (!procdir) return; snprintf(name, sizeof(name), "%x", ent->type); if (size) - pde_data = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); + pded = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); else - pde_data = NULL; + pded = NULL; proc_create_single_data(name, S_IFREG | 0444, procdir, - nubus_proc_rsrc_show, pde_data); + nubus_proc_rsrc_show, pded); } void nubus_proc_add_rsrc(struct proc_dir_entry *procdir, diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index cf91cb024be3..1e4a5663d011 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -168,14 +168,14 @@ static int led_proc_show(struct seq_file *m, void *v) static int led_proc_open(struct inode *inode, struct file *file) { - return single_open(file, led_proc_show, PDE_DATA(inode)); + return single_open(file, led_proc_show, pde_data(inode)); } static ssize_t led_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); char *cur, lbuf[32]; int d; diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index cb18f8a13ab6..9c7edec64f7e 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -21,14 +21,14 @@ static int proc_initialized; /* = 0 */ static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); return fixed_size_llseek(file, off, whence, dev->cfg_size); } static ssize_t proc_bus_pci_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); unsigned int pos = *ppos; unsigned int cnt, size; @@ -114,7 +114,7 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { struct inode *ino = file_inode(file); - struct pci_dev *dev = PDE_DATA(ino); + struct pci_dev *dev = pde_data(ino); int pos = *ppos; int size = dev->cfg_size; int cnt, ret; @@ -196,7 +196,7 @@ struct pci_filp_private { static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); #ifdef HAVE_PCI_MMAP struct pci_filp_private *fpriv = file->private_data; #endif /* HAVE_PCI_MMAP */ @@ -244,7 +244,7 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #ifdef HAVE_PCI_MMAP static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 82fa6148216c..098180fb1cfc 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -880,14 +880,14 @@ static int dispatch_proc_show(struct seq_file *m, void *v) static int dispatch_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dispatch_proc_show, PDE_DATA(inode)); + return single_open(file, dispatch_proc_show, pde_data(inode)); } static ssize_t dispatch_proc_write(struct file *file, const char __user *userbuf, size_t count, loff_t *pos) { - struct ibm_struct *ibm = PDE_DATA(file_inode(file)); + struct ibm_struct *ibm = pde_data(file_inode(file)); char *kernbuf; int ret; diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 352508d30467..f113dec98e21 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1368,7 +1368,7 @@ static int lcd_proc_show(struct seq_file *m, void *v) static int lcd_proc_open(struct inode *inode, struct file *file) { - return single_open(file, lcd_proc_show, PDE_DATA(inode)); + return single_open(file, lcd_proc_show, pde_data(inode)); } static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value) @@ -1404,7 +1404,7 @@ static int set_lcd_status(struct backlight_device *bd) static ssize_t lcd_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int levels; @@ -1469,13 +1469,13 @@ static int video_proc_show(struct seq_file *m, void *v) static int video_proc_open(struct inode *inode, struct file *file) { - return single_open(file, video_proc_show, PDE_DATA(inode)); + return single_open(file, video_proc_show, pde_data(inode)); } static ssize_t video_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char *buffer; char *cmd; int lcd_out = -1, crt_out = -1, tv_out = -1; @@ -1580,13 +1580,13 @@ static int fan_proc_show(struct seq_file *m, void *v) static int fan_proc_open(struct inode *inode, struct file *file) { - return single_open(file, fan_proc_show, PDE_DATA(inode)); + return single_open(file, fan_proc_show, pde_data(inode)); } static ssize_t fan_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; @@ -1628,13 +1628,13 @@ static int keys_proc_show(struct seq_file *m, void *v) static int keys_proc_open(struct inode *inode, struct file *file) { - return single_open(file, keys_proc_show, PDE_DATA(inode)); + return single_open(file, keys_proc_show, pde_data(inode)); } static ssize_t keys_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 1ae458c02656..55ae72a2818b 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -22,7 +22,7 @@ static loff_t isapnp_proc_bus_lseek(struct file *file, loff_t off, int whence) static ssize_t isapnp_proc_bus_read(struct file *file, char __user * buf, size_t nbytes, loff_t * ppos) { - struct pnp_dev *dev = PDE_DATA(file_inode(file)); + struct pnp_dev *dev = pde_data(file_inode(file)); int pos = *ppos; int cnt, size = 256; diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c index a806830e3a40..0f0d819b157f 100644 --- a/drivers/pnp/pnpbios/proc.c +++ b/drivers/pnp/pnpbios/proc.c @@ -173,13 +173,13 @@ static int pnpbios_proc_show(struct seq_file *m, void *v) static int pnpbios_proc_open(struct inode *inode, struct file *file) { - return single_open(file, pnpbios_proc_show, PDE_DATA(inode)); + return single_open(file, pnpbios_proc_show, pde_data(inode)); } static ssize_t pnpbios_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); struct pnp_bios_node *node; int boot = (long)data >> 8; u8 nodenum = (long)data; diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index d6982d355739..95aee1ad1383 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -49,7 +49,7 @@ static DEFINE_MUTEX(global_host_template_mutex); static ssize_t proc_scsi_host_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct Scsi_Host *shost = PDE_DATA(file_inode(file)); + struct Scsi_Host *shost = pde_data(file_inode(file)); ssize_t ret = -ENOMEM; char *page; @@ -79,7 +79,7 @@ static int proc_scsi_show(struct seq_file *m, void *v) static int proc_scsi_host_open(struct inode *inode, struct file *file) { - return single_open_size(file, proc_scsi_show, PDE_DATA(inode), + return single_open_size(file, proc_scsi_show, pde_data(inode), 4 * PAGE_SIZE); } diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 64de9f1b874c..431d5a7d737e 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -1117,7 +1117,7 @@ static int rndis_proc_show(struct seq_file *m, void *v) static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - rndis_params *p = PDE_DATA(file_inode(file)); + rndis_params *p = pde_data(file_inode(file)); u32 speed = 0; int i, fl_speed = 0; @@ -1161,7 +1161,7 @@ static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, static int rndis_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rndis_proc_show, PDE_DATA(inode)); + return single_open(file, rndis_proc_show, pde_data(inode)); } static const struct proc_ops rndis_proc_ops = { diff --git a/drivers/zorro/proc.c b/drivers/zorro/proc.c index 1c9ae08225d8..f916bf60b312 100644 --- a/drivers/zorro/proc.c +++ b/drivers/zorro/proc.c @@ -30,7 +30,7 @@ proc_bus_zorro_lseek(struct file *file, loff_t off, int whence) static ssize_t proc_bus_zorro_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct zorro_dev *z = PDE_DATA(file_inode(file)); + struct zorro_dev *z = pde_data(file_inode(file)); struct ConfigDev cd; loff_t pos = *ppos; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 065a28bfa3f1..e1b863449296 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); rcu_read_lock(); return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); @@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } @@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { struct afs_vl_seq_net_private *priv = m->private; struct afs_vlserver_list *vllist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf2fd9fc7d98..9f86dd947032 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,7 +2834,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2845,7 +2845,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2857,7 +2857,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2985,7 +2985,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) __acquires(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; read_lock(&EXT4_SB(sb)->s_mb_rb_lock); @@ -2998,7 +2998,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; @@ -3010,7 +3010,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; @@ -3058,7 +3058,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) __releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b66..f13d548e4a7f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1212,7 +1212,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE_DATA(inode); + journal_t *journal = pde_data(inode); struct jbd2_stats_proc_session *s; int rc, size; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 39b823ab2564..e1cfeda397f3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -138,7 +138,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -245,7 +245,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b6e7005cc1b2..81d6e4ec2294 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -120,8 +120,6 @@ static inline void *pde_data(const struct inode *inode) return inode->i_private; } -#define PDE_DATA(i) pde_data(i) - extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); @@ -202,7 +200,7 @@ proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;} +static inline void *pde_data(const struct inode *inode) {BUG(); return NULL;} static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; } static inline void proc_remove(struct proc_dir_entry *de) {} diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 72dbb44a4573..88cc16444b43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = { \ #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ - return single_open(file, __name ## _show, PDE_DATA(inode)); \ + return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ diff --git a/ipc/util.c b/ipc/util.c index fa2d86ef3fb8..a2208d0f26b2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -894,7 +894,7 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - iter->iface = PDE_DATA(inode); + iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ee595ec09778..623b8136e9af 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); + unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; @@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { @@ -265,7 +265,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE_DATA(inode)); + return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { diff --git a/kernel/resource.c b/kernel/resource.c index 5ad3eba619ba..9c08d6e9eef2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -99,7 +99,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); + struct resource *p = pde_data(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -115,7 +115,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; diff --git a/net/atm/proc.c b/net/atm/proc.c index 4369ffa3302a..9bf736290e48 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -108,7 +108,7 @@ out: static inline void *vcc_walk(struct seq_file *seq, loff_t l) { struct vcc_state *state = seq->private; - int family = (uintptr_t)(PDE_DATA(file_inode(seq->file))); + int family = (uintptr_t)(pde_data(file_inode(seq->file))); return __vcc_walk(&state->sk, family, &state->bucket, l) ? state : NULL; @@ -324,7 +324,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, page = get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; - dev = PDE_DATA(file_inode(file)); + dev = pde_data(file_inode(file)); if (!dev->ops->proc_read) length = -EINVAL; else { diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1661979b6a6e..ee319779781e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -611,7 +611,7 @@ EXPORT_SYMBOL(bt_sock_wait_ready); static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); @@ -619,7 +619,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos) static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } @@ -627,14 +627,14 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct seq_file *seq, void *v) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); diff --git a/net/can/bcm.c b/net/can/bcm.c index bc88d901a1c0..95d209b52e6a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -193,7 +193,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; - struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); + struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; diff --git a/net/can/proc.c b/net/can/proc.c index b3099f0a3cb8..bbce97825f13 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -305,7 +305,7 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)PDE_DATA(m->file->f_inode); + int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 213cb7b26b7a..6c2016f7f3d1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3364,7 +3364,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -3381,7 +3381,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -3401,7 +3401,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 560a5e712dc3..84b62cd7bc57 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -546,7 +546,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE_DATA(inode)); + return single_open(file, pgctrl_show, pde_data(inode)); } static const struct proc_ops pktgen_proc_ops = { @@ -1811,7 +1811,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE_DATA(inode)); + return single_open(file, pktgen_if_show, pde_data(inode)); } static const struct proc_ops pktgen_if_proc_ops = { @@ -1948,7 +1948,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE_DATA(inode)); + return single_open(file, pktgen_thread_show, pde_data(inode)); } static const struct proc_ops pktgen_thread_proc_ops = { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b518f20c9a24..f8e176c77d1c 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -776,7 +776,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); sf->private = c; @@ -788,7 +788,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); int ret; ret = seq_release(inode, file); @@ -802,7 +802,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { - struct clusterip_config *c = PDE_DATA(file_inode(file)); + struct clusterip_config *c = pde_data(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a53f256bf9d3..9eb5fc247868 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -971,7 +971,7 @@ struct proto raw_prot = { static struct sock *raw_get_first(struct seq_file *seq) { struct sock *sk; - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; @@ -987,7 +987,7 @@ found: static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); do { @@ -1016,7 +1016,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -1039,7 +1039,7 @@ EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_unlock(&h->lock); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..b53476e78c84 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3002,7 +3002,7 @@ static unsigned short seq_file_family(const struct seq_file *seq) #endif /* Iterated from proc fs */ - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 464590ea922e..090360939401 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2960,7 +2960,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { @@ -2993,7 +2993,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); do { sk = sk_next(sk); @@ -3050,7 +3050,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 25524e393349..54a489f16b17 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1517,7 +1517,7 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1529,7 +1529,7 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1540,7 +1540,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void xt_table_seq_stop(struct seq_file *seq, void *v) { - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u_int8_t af = (unsigned long)pde_data(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } @@ -1584,7 +1584,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) @@ -1633,7 +1633,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9c5cfd74a0ee..0859b8f76764 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1052,7 +1052,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -1069,7 +1069,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); @@ -1083,7 +1083,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) @@ -1125,7 +1125,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1140,7 +1140,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1155,7 +1155,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1169,7 +1169,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v2(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; @@ -1183,7 +1183,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v) static int dl_seq_show_v1(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; @@ -1197,7 +1197,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 0446307516cd..7ddb9a78e3fc 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -551,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file) if (st == NULL) return -ENOMEM; - st->table = PDE_DATA(inode); + st->table = pde_data(inode); return 0; } @@ -559,7 +559,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - struct recent_table *t = PDE_DATA(file_inode(file)); + struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index b87565b64928..c2ba9d4cd2c7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1433,7 +1433,7 @@ static bool use_gss_proxy(struct net *net) static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; @@ -1461,7 +1461,7 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 59641803472c..bb1177395b99 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1536,7 +1536,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } @@ -1544,14 +1544,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf, static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_poll(filp, wait, cd); } @@ -1560,21 +1560,21 @@ static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_release(inode, filp, cd); } @@ -1591,14 +1591,14 @@ static const struct proc_ops cache_channel_proc_ops = { static int content_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_release(inode, filp, cd); } @@ -1612,14 +1612,14 @@ static const struct proc_ops content_proc_ops = { static int open_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return release_flush(inode, filp, cd); } @@ -1627,7 +1627,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp) static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } @@ -1636,7 +1636,7 @@ static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index c964b48eaaba..52908f9e6eab 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -66,7 +66,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { static int rpc_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rpc_proc_show, PDE_DATA(inode)); + return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { diff --git a/sound/core/info.c b/sound/core/info.c index a451b24199c3..782fba87cc04 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -234,7 +234,7 @@ static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) static int snd_info_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; @@ -365,7 +365,7 @@ static int snd_info_seq_show(struct seq_file *seq, void *p) static int snd_info_text_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; From 2dba5eb1c73b6ba2988ced07250edeac0f8cbf5a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 21 Jan 2022 22:14:27 -0800 Subject: [PATCH 0387/1295] lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() Currently, enabling CONFIG_STACKDEPOT means its stack_table will be allocated from memblock, even if stack depot ends up not actually used. The default size of stack_table is 4MB on 32-bit, 8MB on 64-bit. This is fine for use-cases such as KASAN which is also a config option and has overhead on its own. But it's an issue for functionality that has to be actually enabled on boot (page_owner) or depends on hardware (GPU drivers) and thus the memory might be wasted. This was raised as an issue [1] when attempting to add stackdepot support for SLUB's debug object tracking functionality. It's common to build kernels with CONFIG_SLUB_DEBUG and enable slub_debug on boot only when needed, or create only specific kmem caches with debugging for testing purposes. It would thus be more efficient if stackdepot's table was allocated only when actually going to be used. This patch thus makes the allocation (and whole stack_depot_init() call) optional: - Add a CONFIG_STACKDEPOT_ALWAYS_INIT flag to keep using the current well-defined point of allocation as part of mem_init(). Make CONFIG_KASAN select this flag. - Other users have to call stack_depot_init() as part of their own init when it's determined that stack depot will actually be used. This may depend on both config and runtime conditions. Convert current users which are page_owner and several in the DRM subsystem. Same will be done for SLUB later. - Because the init might now be called after the boot-time memblock allocation has given all memory to the buddy allocator, change stack_depot_init() to allocate stack_table with kvmalloc() when memblock is no longer available. Also handle allocation failure by disabling stackdepot (could have theoretically happened even with memblock allocation previously), and don't unnecessarily align the memblock allocation to its own size anymore. [1] https://lore.kernel.org/all/CAMuHMdW=eoVzM1Re5FVoEN87nKfiLmM2+Ah7eNu2KXEhCvbZyA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211013073005.11351-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Dmitry Vyukov Reviewed-by: Marco Elver # stackdepot Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan From: Colin Ian King Subject: lib/stackdepot: fix spelling mistake and grammar in pr_err message There is a spelling mistake of the work allocation so fix this and re-phrase the message to make it easier to read. Link: https://lkml.kernel.org/r/20211015104159.11282-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Vlastimil Babka From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup On FLATMEM, we call page_ext_init_flatmem_late() just before kmem_cache_init() which means stack_depot_init() (called by page owner init) will not recognize properly it should use kvmalloc() and not memblock_alloc(). memblock_alloc() will also not issue a warning and return a block memory that can be invalid and cause kernel page fault when saving stacks, as reported by the kernel test robot [1]. Fix this by moving page_ext_init_flatmem_late() below kmem_cache_init() so that slab_is_available() is true during stack_depot_init(). SPARSEMEM doesn't have this issue, as it doesn't do page_ext_init_flatmem_late(), but a different page_ext_init() even later in the boot process. Thanks to Mike Rapoport for pointing out the FLATMEM init ordering issue. While at it, also actually resolve a checkpatch warning in stack_depot_init() from DRM CI, which was supposed to be in the original patch already. [1] https://lore.kernel.org/all/20211014085450.GC18719@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/6abd9213-19a9-6d58-cedc-2414386d2d81@suse.cz Signed-off-by: Vlastimil Babka Reported-by: kernel test robot Cc: Mike Rapoport Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup3 Due to cd06ab2fd48f ("drm/locking: add backtrace for locking contended locks without backoff") landing recently to -next adding a new stack depot user in drivers/gpu/drm/drm_modeset_lock.c we need to add an appropriate call to stack_depot_init() there as well. Link: https://lkml.kernel.org/r/2a692365-cfa1-64f2-34e0-8aa5674dce5e@suse.cz Signed-off-by: Vlastimil Babka Cc: Jani Nikula Cc: Naresh Kamboju Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup4 Due to 4e66934eaadc ("lib: add reference counting tracking infrastructure") landing recently to net-next adding a new stack depot user in lib/ref_tracker.c we need to add an appropriate call to stack_depot_init() there as well. Link: https://lkml.kernel.org/r/45c1b738-1a2f-5b5f-2f6d-86fab206d01c@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Eric Dumazet Cc: Jiri Slab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_dp_mst_topology.c | 1 + drivers/gpu/drm/drm_mm.c | 4 +++ drivers/gpu/drm/drm_modeset_lock.c | 9 +++++++ drivers/gpu/drm/i915/intel_runtime_pm.c | 3 +++ include/linux/ref_tracker.h | 2 ++ include/linux/stackdepot.h | 25 ++++++++++++------- init/main.c | 9 ++++--- lib/Kconfig | 4 +++ lib/Kconfig.kasan | 2 +- lib/stackdepot.c | 33 +++++++++++++++++++++---- mm/page_owner.c | 2 ++ 11 files changed, 76 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index f3d79eda94bb..8b3822142fed 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -5511,6 +5511,7 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr, mutex_init(&mgr->probe_lock); #if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS) mutex_init(&mgr->topology_ref_history_lock); + stack_depot_init(); #endif INIT_LIST_HEAD(&mgr->tx_msg_downq); INIT_LIST_HEAD(&mgr->destroy_port_list); diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 7d1c578388d3..8257f9d4f619 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -980,6 +980,10 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size) add_hole(&mm->head_node); mm->scan_active = 0; + +#ifdef CONFIG_DRM_DEBUG_MM + stack_depot_init(); +#endif } EXPORT_SYMBOL(drm_mm_init); diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index c97323365675..918065982db4 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -107,6 +107,11 @@ static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) kfree(buf); } + +static void __drm_stack_depot_init(void) +{ + stack_depot_init(); +} #else /* CONFIG_DRM_DEBUG_MODESET_LOCK */ static depot_stack_handle_t __drm_stack_depot_save(void) { @@ -115,6 +120,9 @@ static depot_stack_handle_t __drm_stack_depot_save(void) static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) { } +static void __drm_stack_depot_init(void) +{ +} #endif /* CONFIG_DRM_DEBUG_MODESET_LOCK */ /** @@ -359,6 +367,7 @@ void drm_modeset_lock_init(struct drm_modeset_lock *lock) { ww_mutex_init(&lock->mutex, &crtc_ww_class); INIT_LIST_HEAD(&lock->head); + __drm_stack_depot_init(); } EXPORT_SYMBOL(drm_modeset_lock_init); diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 22dab36afcb6..53f1ccb78849 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -68,6 +68,9 @@ static noinline depot_stack_handle_t __save_depot_stack(void) static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm) { spin_lock_init(&rpm->debug.lock); + + if (rpm->available) + stack_depot_init(); } static noinline depot_stack_handle_t diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index c11c9db5825c..60f3453be23e 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ref_tracker; @@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, spin_lock_init(&dir->lock); dir->quarantine_avail = quarantine_count; refcount_set(&dir->untracked, 1); + stack_depot_init(); } void ref_tracker_dir_exit(struct ref_tracker_dir *dir); diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c34b55a6e554..17f992fe6355 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,6 +19,22 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +/* + * Every user of stack depot has to call this during its own init when it's + * decided that it will be calling stack_depot_save() later. + * + * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot + * enabled as part of mm_init(), for subsystems where it's known at compile time + * that stack depot will be used. + */ +int stack_depot_init(void); + +#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT +static inline int stack_depot_early_init(void) { return stack_depot_init(); } +#else +static inline int stack_depot_early_init(void) { return 0; } +#endif + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); @@ -30,13 +46,4 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, void stack_depot_print(depot_stack_handle_t stack); -#ifdef CONFIG_STACKDEPOT -int stack_depot_init(void); -#else -static inline int stack_depot_init(void) -{ - return 0; -} -#endif /* CONFIG_STACKDEPOT */ - #endif diff --git a/init/main.c b/init/main.c index bb984ed79de0..65fa2e41a9c0 100644 --- a/init/main.c +++ b/init/main.c @@ -834,12 +834,15 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); - stack_depot_init(); + stack_depot_early_init(); mem_init(); mem_init_print_info(); - /* page_owner must be initialized after buddy is ready */ - page_ext_init_flatmem_late(); kmem_cache_init(); + /* + * page_owner must be initialized after buddy is ready, and also after + * slab is ready so that stack_depot_init() works properly + */ + page_ext_init_flatmem_late(); kmemleak_init(); pgtable_init(); debug_objects_mem_init(); diff --git a/lib/Kconfig b/lib/Kconfig index c20b68ad2bc3..51c368a50b16 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -673,6 +673,10 @@ config STACKDEPOT bool select STACKTRACE +config STACKDEPOT_ALWAYS_INIT + bool + select STACKDEPOT + config STACK_HASH_ORDER int "stack depot hash size (12 => 4KB, 20 => 1024KB)" range 12 20 diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index cdc842d090db..879757b6dd14 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -38,7 +38,7 @@ menuconfig KASAN CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ HAVE_ARCH_KASAN_HW_TAGS depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) - select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT help Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b437ae79aca1..00ccb106f1a8 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -161,18 +162,40 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -int __init stack_depot_init(void) +/* + * __ref because of memblock_alloc(), which will not be actually called after + * the __init code is gone, because at that point slab_is_available() is true + */ +__ref int stack_depot_init(void) { - if (!stack_depot_disable) { + static DEFINE_MUTEX(stack_depot_init_mutex); + + mutex_lock(&stack_depot_init_mutex); + if (!stack_depot_disable && !stack_table) { size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); int i; - stack_table = memblock_alloc(size, size); - for (i = 0; i < STACK_HASH_SIZE; i++) - stack_table[i] = NULL; + if (slab_is_available()) { + pr_info("Stack Depot allocating hash table with kvmalloc\n"); + stack_table = kvmalloc(size, GFP_KERNEL); + } else { + pr_info("Stack Depot allocating hash table with memblock_alloc\n"); + stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + } + if (stack_table) { + for (i = 0; i < STACK_HASH_SIZE; i++) + stack_table[i] = NULL; + } else { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + mutex_unlock(&stack_depot_init_mutex); + return -ENOMEM; + } } + mutex_unlock(&stack_depot_init_mutex); return 0; } +EXPORT_SYMBOL_GPL(stack_depot_init); /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) diff --git a/mm/page_owner.c b/mm/page_owner.c index 5eea061bb1e5..99e360df9465 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -80,6 +80,8 @@ static __init void init_page_owner(void) if (!page_owner_enabled) return; + stack_depot_init(); + register_dummy_stack(); register_failure_stack(); register_early_stack(); From e940066089490efde86abc519593be84362f4e53 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Jan 2022 22:14:31 -0800 Subject: [PATCH 0388/1295] lib/stackdepot: always do filter_irq_stacks() in stack_depot_save() The non-interrupt portion of interrupt stack traces before interrupt entry is usually arbitrary. Therefore, saving stack traces of interrupts (that include entries before interrupt entry) to stack depot leads to unbounded stackdepot growth. As such, use of filter_irq_stacks() is a requirement to ensure stackdepot can efficiently deduplicate interrupt stacks. Looking through all current users of stack_depot_save(), none (except KASAN) pass the stack trace through filter_irq_stacks() before passing it on to stack_depot_save(). Rather than adding filter_irq_stacks() to all current users of stack_depot_save(), it became clear that stack_depot_save() should simply do filter_irq_stacks(). Link: https://lkml.kernel.org/r/20211130095727.2378739-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Alexander Potapenko Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vijayanand Jitta Cc: "Gustavo A. R. Silva" Cc: Imran Khan Cc: Chris Wilson Cc: Jani Nikula Cc: Mika Kuoppala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 13 +++++++++++++ mm/kasan/common.c | 1 - 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 00ccb106f1a8..bf5ba9af0500 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -328,6 +328,9 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids * any allocations and will fail if no space is left to store the stack trace. * + * If the stack trace in @entries is from an interrupt, only the portion up to + * interrupt entry is saved. + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -346,6 +349,16 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned long flags; u32 hash; + /* + * If this stack trace is from an interrupt, including anything before + * interrupt entry usually leads to unbounded stackdepot growth. + * + * Because use of filter_irq_stacks() is a requirement to ensure + * stackdepot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stackdepot. + */ + nr_entries = filter_irq_stacks(entries, nr_entries); + if (unlikely(nr_entries == 0) || stack_depot_disable) goto fast_exit; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c06db78a76c..92196562687b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,7 +36,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - nr_entries = filter_irq_stacks(entries, nr_entries); return __stack_depot_save(entries, nr_entries, flags, can_alloc); } From 0a4ee518185e902758191d968600399f3bc2be31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:34 -0800 Subject: [PATCH 0389/1295] mm: remove cleancache Patch series "remove Xen tmem leftovers". Since the removal of the Xen tmem driver in 2019, the cleancache hooks are entirely unused, as are large parts of frontswap. This series against linux-next (with the folio changes included) removes cleancaches, and cuts down frontswap to the bits actually used by zswap. This patch (of 13): The cleancache subsystem is unused since the removal of Xen tmem driver in commit 814bbf49dcd0 ("xen: remove tmem driver"). [akpm@linux-foundation.org: remove now-unreachable code] Link: https://lkml.kernel.org/r/20211224062246.1258487-1-hch@lst.de Link: https://lkml.kernel.org/r/20211224062246.1258487-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Acked-by: Geert Uytterhoeven Cc: Konrad Rzeszutek Wilk Cc: Hugh Dickins Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/cleancache.rst | 296 ------------------------- Documentation/vm/frontswap.rst | 12 +- Documentation/vm/index.rst | 1 - MAINTAINERS | 7 - arch/arm/configs/bcm2835_defconfig | 1 - arch/arm/configs/qcom_defconfig | 1 - arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - block/bdev.c | 5 - fs/btrfs/extent_io.c | 10 - fs/btrfs/super.c | 2 - fs/ext4/readpage.c | 6 - fs/ext4/super.c | 3 - fs/f2fs/data.c | 13 -- fs/mpage.c | 7 - fs/ntfs3/ntfs_fs.h | 1 - fs/ocfs2/super.c | 2 - fs/super.c | 3 - include/linux/cleancache.h | 124 ----------- include/linux/fs.h | 5 - mm/Kconfig | 22 -- mm/Makefile | 1 - mm/cleancache.c | 315 --------------------------- mm/filemap.c | 11 - mm/truncate.c | 15 +- 37 files changed, 4 insertions(+), 873 deletions(-) delete mode 100644 Documentation/vm/cleancache.rst delete mode 100644 include/linux/cleancache.h delete mode 100644 mm/cleancache.c diff --git a/Documentation/vm/cleancache.rst b/Documentation/vm/cleancache.rst deleted file mode 100644 index 68cba9131c31..000000000000 --- a/Documentation/vm/cleancache.rst +++ /dev/null @@ -1,296 +0,0 @@ -.. _cleancache: - -========== -Cleancache -========== - -Motivation -========== - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. - -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -:ref:`FAQs ` are included below. - -Implementation Overview -======================= - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -The functions provided must conform to certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. -An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. - -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -Cleancache Performance Metrics -============================== - -If properly configured, monitoring of cleancache is done via debugfs in -the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -``succ_gets`` - number of gets that were successful - -``failed_gets`` - number of gets that failed - -``puts`` - number of puts attempted (all "succeed") - -``invalidates`` - number of invalidates attempted - -A backend implementation may provide additional metrics. - -.. _faq: - -FAQ -=== - -* Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -* Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. - -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -* Why not make cleancache asynchronous and batched so it can more - easily interface with real devices with DMA instead of copying each - individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -* Why is non-shared cleancache "exclusive"? And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -* What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -* How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - - - The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) - - To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations - - To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. - - The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. - - To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) - - Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. - - A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -* Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -* Why is a global variable required? - -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -* Does cleanache work with KVM? - -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -* Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index 1979f430c1c5..e2e5ab3e375e 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -8,12 +8,6 @@ Frontswap provides a "transcendent memory" interface for swap pages. In some environments, dramatic performance savings may be obtained because swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. -(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" -and the only necessary changes to the core kernel for transcendent memory; -all other supporting code -- the "backends" -- is implemented as drivers. -See the LWN.net article `Transcendent memory in a nutshell`_ -for a detailed overview of frontswap and related kernel parts) - .. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ Frontswap is so named because it can be thought of as the opposite of @@ -87,11 +81,9 @@ This interface is ideal when data is transformed to a different form and size (such as with compression) or secretly moved (as might be useful for write-balancing for some RAM-like devices). Swap pages (and evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and -cleancache) interface to transcendent memory provides a nice way to read -and write -- and indirectly "name" -- the pages. +but-much-faster-than-disk "pseudo-RAM device". -Frontswap -- and cleancache -- with a fairly small impact on the kernel, +Frontswap with a fairly small impact on the kernel, provides a huge amount of flexibility for more dynamic, flexible RAM utilization in various system configurations: diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 932440805453..44365c4574a3 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -15,7 +15,6 @@ algorithms. If you are looking for advice on simply allocating memory, see the active_mm arch_pgtable_helpers balance - cleancache damon/index free_page_reporting frontswap diff --git a/MAINTAINERS b/MAINTAINERS index 6c08cbe953fe..45f8750f6e30 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4705,13 +4705,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/cla F: include/linux/cfi.h F: kernel/cfi.c -CLEANCACHE API -M: Konrad Rzeszutek Wilk -L: linux-kernel@vger.kernel.org -S: Maintained -F: include/linux/cleancache.h -F: mm/cleancache.c - CLK API M: Russell King L: linux-clk@vger.kernel.org diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig index 383c632eba7b..a9ed79b7f871 100644 --- a/arch/arm/configs/bcm2835_defconfig +++ b/arch/arm/configs/bcm2835_defconfig @@ -31,7 +31,6 @@ CONFIG_ARCH_BCM2835=y CONFIG_PREEMPT_VOLUNTARY=y CONFIG_AEABI=y CONFIG_KSM=y -CONFIG_CLEANCACHE=y CONFIG_CMA=y CONFIG_SECCOMP=y CONFIG_KEXEC=y diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index 0daa9c0d298e..9981566f2096 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -27,7 +27,6 @@ CONFIG_PCIE_QCOM=y CONFIG_SMP=y CONFIG_PREEMPT=y CONFIG_HIGHMEM=y -CONFIG_CLEANCACHE=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y CONFIG_CPU_IDLE=y diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index a4b6c7108465..bc9952f8be66 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -45,7 +45,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 2db721965520..a77269c6e5ba 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -41,7 +41,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c266a704eecd..7a74efa6b9a1 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -48,7 +48,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index f644f08dd6ed..a5323bf2eb33 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e4924650b687..5e80aa0869d5 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -40,7 +40,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 24113871ea76..e84326a3f62d 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 6a7e4be70eea..337552f43339 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -59,7 +59,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 1d223247aff0..7b688f7d272a 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -37,7 +37,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 961f789f96c9..7c2cb31d63dd 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index ff4b5e469390..ca43897af26d 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 5f228621d0cc..e3d515f37144 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a600cb9e68c2..d601606c969b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 354e51dcb3e2..7fe8975b49ec 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -96,7 +96,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8dee6c3782f3..466780c465f5 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -91,7 +91,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 diff --git a/block/bdev.c b/block/bdev.c index 8bf93a19041b..102837a37051 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include "../fs/internal.h" @@ -88,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev) lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..409bad3928db 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "misc.h" #include "extent_io.h" @@ -3578,15 +3577,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, goto out; } - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - unlock_extent(tree, start, end); - unlock_page(page); - goto out; - } - } - if (page->index == last_byte >> PAGE_SHIFT) { size_t zero_offset = offset_in_page(last_byte); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0ec09fe01be6..4d947ba32da9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1374,7 +1373,6 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - cleancache_init_fs(sb); sb->s_flags |= SB_ACTIVE; return 0; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3db923403505..4cd62f1d848c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -43,7 +43,6 @@ #include #include #include -#include #include "ext4.h" @@ -350,11 +349,6 @@ int ext4_mpage_readpages(struct inode *inode, } else if (fully_mapped) { SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && - !PageUptodate(page) && cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } /* * This page will go to BIO. Do we need to send this diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db9fe4843529..eee0d9ebfa6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -3149,8 +3148,6 @@ done: EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); - - cleancache_init_fs(sb); return err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0a1d236212f8..8c417864c66a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -2035,12 +2034,6 @@ got_it: block_nr = map->m_pblk + block_in_file - map->m_lblk; SetPageMappedToDisk(page); - if (!PageUptodate(page) && (!PageSwapCache(page) && - !cleancache_get_page(page))) { - SetPageUptodate(page); - goto confused; - } - if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; @@ -2096,12 +2089,6 @@ submit_and_realloc: ClearPageError(page); *last_block_in_bio = block_nr; goto out; -confused: - if (bio) { - __submit_bio(F2FS_I_SB(inode), bio, DATA); - bio = NULL; - } - unlock_page(page); out: *bio_ret = bio; return ret; diff --git a/fs/mpage.c b/fs/mpage.c index 334e7d09aa65..87f5cfef6caa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "internal.h" /* @@ -284,12 +283,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } - /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8aaec7e0804e..fb825059d488 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1286b88b6fa1..2772dec9dcea 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #define CREATE_TRACE_POINTS @@ -2283,7 +2282,6 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs(sb); osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { diff --git a/fs/super.c b/fs/super.c index a6405d44d4ca..7af820ba5ad5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -260,7 +259,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; @@ -330,7 +328,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - cleancache_invalidate_fs(s); unregister_shrinker(&s->s_shrink); fs->kill_sb(s); diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h deleted file mode 100644 index 5f5730c1d324..000000000000 --- a/include/linux/cleancache.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_NO_POOL -1 -#define CLEANCACHE_NO_BACKEND -2 -#define CLEANCACHE_NO_BACKEND_SHARED -3 - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(uuid_t *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); - void (*invalidate_inode)(int, struct cleancache_filekey); - void (*invalidate_fs)(int); -}; - -extern int cleancache_register_ops(const struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_invalidate_page(struct address_space *, struct page *); -extern void __cleancache_invalidate_inode(struct address_space *); -extern void __cleancache_invalidate_fs(struct super_block *); - -#ifdef CONFIG_CLEANCACHE -#define cleancache_enabled (1) -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled(struct page *page) -{ - return cleancache_fs_enabled_mapping(page->mapping); -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. - * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - return __cleancache_get_page(page); - return -1; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_page(mapping, page); -} - -static inline void cleancache_invalidate_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_inode(mapping); -} - -static inline void cleancache_invalidate_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_invalidate_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 9617dea24978..f3daaea16554 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1535,11 +1535,6 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; - struct shrinker s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ diff --git a/mm/Kconfig b/mm/Kconfig index a99bd499ef51..430240289b02 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -444,28 +444,6 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancache code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache - config FRONTSWAP bool "Enable frontswap to cache swap pages if tmem is present" depends on SWAP diff --git a/mm/Makefile b/mm/Makefile index 588d3113f3b0..70d4309c9ce3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,7 +104,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o diff --git a/mm/cleancache.c b/mm/cleancache.c deleted file mode 100644 index db7eee9c0886..000000000000 --- a/mm/cleancache.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.rst for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include -#include - -/* - * cleancache_ops is set by cleancache_register_ops to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static const struct cleancache_ops *cleancache_ops __read_mostly; - -/* - * Counters available via /sys/kernel/debug/cleancache (if debugfs is - * properly configured. These are for information only so are not protected - * against increment races. - */ -static u64 cleancache_succ_gets; -static u64 cleancache_failed_gets; -static u64 cleancache_puts; -static u64 cleancache_invalidates; - -static void cleancache_register_ops_sb(struct super_block *sb, void *unused) -{ - switch (sb->cleancache_poolid) { - case CLEANCACHE_NO_BACKEND: - __cleancache_init_fs(sb); - break; - case CLEANCACHE_NO_BACKEND_SHARED: - __cleancache_init_shared_fs(sb); - break; - } -} - -/* - * Register operations for cleancache. Returns 0 on success. - */ -int cleancache_register_ops(const struct cleancache_ops *ops) -{ - if (cmpxchg(&cleancache_ops, NULL, ops)) - return -EBUSY; - - /* - * A cleancache backend can be built as a module and hence loaded after - * a cleancache enabled filesystem has called cleancache_init_fs. To - * handle such a scenario, here we call ->init_fs or ->init_shared_fs - * for each active super block. To differentiate between local and - * shared filesystems, we temporarily initialize sb->cleancache_poolid - * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED - * respectively in case there is no backend registered at the time - * cleancache_init_fs or cleancache_init_shared_fs is called. - * - * Since filesystems can be mounted concurrently with cleancache - * backend registration, we have to be careful to guarantee that all - * cleancache enabled filesystems that has been mounted by the time - * cleancache_register_ops is called has got and all mounted later will - * get cleancache_poolid. This is assured by the following statements - * tied together: - * - * a) iterate_supers skips only those super blocks that has started - * ->kill_sb - * - * b) if iterate_supers encounters a super block that has not finished - * ->mount yet, it waits until it is finished - * - * c) cleancache_init_fs is called from ->mount and - * cleancache_invalidate_fs is called from ->kill_sb - * - * d) we call iterate_supers after cleancache_ops has been set - * - * From a) it follows that if iterate_supers skips a super block, then - * either the super block is already dead, in which case we do not need - * to bother initializing cleancache for it, or it was mounted after we - * initiated iterate_supers. In the latter case, it must have seen - * cleancache_ops set according to d) and initialized cleancache from - * ->mount by itself according to c). This proves that we call - * ->init_fs at least once for each active super block. - * - * From b) and c) it follows that if iterate_supers encounters a super - * block that has already started ->init_fs, it will wait until ->mount - * and hence ->init_fs has finished, then check cleancache_poolid, see - * that it has already been set and therefore do nothing. This proves - * that we call ->init_fs no more than once for each super block. - * - * Combined together, the last two paragraphs prove the function - * correctness. - * - * Note that various cleancache callbacks may proceed before this - * function is called or even concurrently with it, but since - * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop - * until the corresponding ->init_fs has been actually called and - * cleancache_ops has been set. - */ - iterate_supers(cleancache_register_ops_sb, NULL); - return 0; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_fs(PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); - if (len <= FILEID_ROOT || len == FILEID_INVALID) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_failed_gets++; - goto out; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = cleancache_ops->get_page(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_puts++; - return; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - cleancache_ops->put_page(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Invalidate any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0) { - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (cleancache_get_key(mapping->host, &key) >= 0) { - cleancache_ops->invalidate_page(pool_id, - key, page->index); - cleancache_invalidates++; - } - } -} -EXPORT_SYMBOL(__cleancache_invalidate_page); - -/* - * Invalidate all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - cleancache_ops->invalidate_inode(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_invalidate_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be returned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs. - */ -void __cleancache_invalidate_fs(struct super_block *sb) -{ - int pool_id; - - pool_id = sb->cleancache_poolid; - sb->cleancache_poolid = CLEANCACHE_NO_POOL; - - if (cleancache_ops && pool_id >= 0) - cleancache_ops->invalidate_fs(pool_id); -} -EXPORT_SYMBOL(__cleancache_invalidate_fs); - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_DEBUG_FS - struct dentry *root = debugfs_create_dir("cleancache", NULL); - - debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets); - debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets); - debugfs_create_u64("puts", 0444, root, &cleancache_puts); - debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates); -#endif - return 0; -} -module_init(init_cleancache) diff --git a/mm/filemap.c b/mm/filemap.c index 60866ae711e2..b50910ac2c88 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -151,16 +150,6 @@ static void filemap_unaccount_folio(struct address_space *mapping, { long nr; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) - cleancache_put_page(&folio->page); - else - cleancache_invalidate_page(mapping, &folio->page); - VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; diff --git a/mm/truncate.c b/mm/truncate.c index 5e243d7269c0..9dbf0b75da5d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,7 +22,6 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include -#include #include #include "internal.h" @@ -264,7 +263,6 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) */ folio_zero_range(folio, offset, length); - cleancache_invalidate_page(folio->mapping, &folio->page); if (folio_has_private(folio)) do_invalidatepage(&folio->page, offset, length); if (!folio_test_large(folio)) @@ -351,7 +349,7 @@ void truncate_inode_pages_range(struct address_space *mapping, bool same_folio; if (mapping_empty(mapping)) - goto out; + return; /* * 'start' and 'end' always covers the range of pages to be fully @@ -442,9 +440,6 @@ void truncate_inode_pages_range(struct address_space *mapping, folio_batch_release(&fbatch); index++; } - -out: - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -498,10 +493,6 @@ void truncate_inode_pages_final(struct address_space *mapping) xa_unlock_irq(&mapping->i_pages); } - /* - * Cleancache needs notification even if there are no pages or shadow - * entries. - */ truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); @@ -661,7 +652,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; if (mapping_empty(mapping)) - goto out; + return 0; folio_batch_init(&fbatch); index = start; @@ -725,8 +716,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } -out: - cleancache_invalidate_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); From 3d6035f136009f9cae380022754cba31f32570c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:38 -0800 Subject: [PATCH 0390/1295] frontswap: remove frontswap_writethrough frontswap_writethrough is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/frontswap.rst | 6 ------ include/linux/frontswap.h | 1 - mm/frontswap.c | 23 +---------------------- 3 files changed, 1 insertion(+), 29 deletions(-) diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index e2e5ab3e375e..2ab660651d04 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -39,12 +39,6 @@ a disk write and, if the data is later read back, a disk read are avoided. If a store returns failure, transcendent memory has rejected the data, and the page can be written to swap as usual. -If a backend chooses, frontswap can be configured as a "writethrough -cache" by calling frontswap_writethrough(). In this mode, the reduction -in swap device writes is lost (and also a non-trivial performance advantage) -in order to allow the backend to arbitrarily "reclaim" space used to -store frontswap pages to more completely manage its memory usage. - Note that if a page is stored and the page already exists in transcendent memory (a "duplicate" store), either the store succeeds and the data is overwritten, or the store fails AND the page is invalidated. This ensures stale data may diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index b07d88c92bb2..4a03fda41572 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -extern void frontswap_writethrough(bool); #define FRONTSWAP_HAS_EXCLUSIVE_GETS extern void frontswap_tmem_exclusive_gets(bool); diff --git a/mm/frontswap.c b/mm/frontswap.c index 6bed12260dea..51a662a83955 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -32,16 +32,6 @@ static struct frontswap_ops *frontswap_ops __read_mostly; #define for_each_frontswap_ops(ops) \ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) -/* - * If enabled, frontswap_store will return failure even on success. As - * a result, the swap subsystem will always write the page to swap, in - * effect converting frontswap into a writethrough cache. In this mode, - * there is no direct reduction in swap writes, but a frontswap backend - * can unilaterally "reclaim" any pages in use with no data loss, thus - * providing increases control over maximum memory usage due to frontswap. - */ -static bool frontswap_writethrough_enabled __read_mostly; - /* * If enabled, the underlying tmem implementation is capable of doing * exclusive gets, so frontswap_load, on a successful tmem_get must @@ -170,15 +160,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } EXPORT_SYMBOL(frontswap_register_ops); -/* - * Enable/disable frontswap writethrough (see above). - */ -void frontswap_writethrough(bool enable) -{ - frontswap_writethrough_enabled = enable; -} -EXPORT_SYMBOL(frontswap_writethrough); - /* * Enable/disable frontswap exclusive gets (see above). */ @@ -283,9 +264,7 @@ int __frontswap_store(struct page *page) } else { inc_frontswap_failed_stores(); } - if (frontswap_writethrough_enabled) - /* report failure so swap also writes to swap device */ - ret = -1; + return ret; } EXPORT_SYMBOL(__frontswap_store); From 71024cb4a0bfe7767aec7a128d0a1a13a37b7fcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:41 -0800 Subject: [PATCH 0391/1295] frontswap: remove frontswap_tmem_exclusive_gets frontswap_tmem_exclusive_gets is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 2 -- mm/frontswap.c | 23 +---------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 4a03fda41572..83a56392cc7f 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,8 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -#define FRONTSWAP_HAS_EXCLUSIVE_GETS -extern void frontswap_tmem_exclusive_gets(bool); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); diff --git a/mm/frontswap.c b/mm/frontswap.c index 51a662a83955..dba7f087ee86 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -32,13 +32,6 @@ static struct frontswap_ops *frontswap_ops __read_mostly; #define for_each_frontswap_ops(ops) \ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) -/* - * If enabled, the underlying tmem implementation is capable of doing - * exclusive gets, so frontswap_load, on a successful tmem_get must - * mark the page as no longer in frontswap AND mark it dirty. - */ -static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; - #ifdef CONFIG_DEBUG_FS /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -160,15 +153,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } EXPORT_SYMBOL(frontswap_register_ops); -/* - * Enable/disable frontswap exclusive gets (see above). - */ -void frontswap_tmem_exclusive_gets(bool enable) -{ - frontswap_tmem_exclusive_gets_enabled = enable; -} -EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); - /* * Called when a swap device is swapon'd. */ @@ -296,13 +280,8 @@ int __frontswap_load(struct page *page) if (!ret) /* successful load */ break; } - if (ret == 0) { + if (ret == 0) inc_frontswap_loads(); - if (frontswap_tmem_exclusive_gets_enabled) { - SetPageDirty(page); - __frontswap_clear(sis, offset); - } - } return ret; } EXPORT_SYMBOL(__frontswap_load); From 0b364446d734da76e421dbfb09e5268270cefaf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:44 -0800 Subject: [PATCH 0392/1295] frontswap: remove frontswap_shrink frontswap_shrink is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/frontswap.rst | 13 ------ include/linux/frontswap.h | 1 - mm/frontswap.c | 83 ---------------------------------- 3 files changed, 97 deletions(-) diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index 2ab660651d04..feecc5e24477 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -255,19 +255,6 @@ the old data and ensure that it is no longer accessible. Since the swap subsystem then writes the new data to the read swap device, this is the correct course of action to ensure coherency. -* What is frontswap_shrink for? - -When the (non-frontswap) swap subsystem swaps out a page to a real -swap device, that page is only taking up low-value pre-allocated disk -space. But if frontswap has placed a page in transcendent memory, that -page may be taking up valuable real estate. The frontswap_shrink -routine allows code outside of the swap subsystem to force pages out -of the memory managed by frontswap and back into kernel-addressable memory. -For example, in RAMster, a "suction driver" thread will attempt -to "repatriate" pages sent to a remote machine back to the local machine; -this is driven using the frontswap_shrink mechanism when memory pressure -subsides. - * Why does the frontswap patch create the new include file swapfile.h? The frontswap code depends on some swap-subsystem-internal data diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 83a56392cc7f..d268d7bb6513 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); diff --git a/mm/frontswap.c b/mm/frontswap.c index dba7f087ee86..a77ebba6101b 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -341,89 +341,6 @@ static unsigned long __frontswap_curr_pages(void) return totalpages; } -static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, - int *swapid) -{ - int ret = -EINVAL; - struct swap_info_struct *si = NULL; - int si_frontswap_pages; - unsigned long total_pages_to_unuse = total; - unsigned long pages = 0, pages_to_unuse = 0; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - si_frontswap_pages = atomic_read(&si->frontswap_pages); - if (total_pages_to_unuse < si_frontswap_pages) { - pages = pages_to_unuse = total_pages_to_unuse; - } else { - pages = si_frontswap_pages; - pages_to_unuse = 0; /* unuse all */ - } - /* ensure there is enough RAM to fetch pages from frontswap */ - if (security_vm_enough_memory_mm(current->mm, pages)) { - ret = -ENOMEM; - continue; - } - vm_unacct_memory(pages); - *unused = pages_to_unuse; - *swapid = si->type; - ret = 0; - break; - } - - return ret; -} - -/* - * Used to check if it's necessary and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shrink pages, - * error code when there is an error. - */ -static int __frontswap_shrink(unsigned long target_pages, - unsigned long *pages_to_unuse, - int *type) -{ - unsigned long total_pages = 0, total_pages_to_unuse; - - assert_spin_locked(&swap_lock); - - total_pages = __frontswap_curr_pages(); - if (total_pages <= target_pages) { - /* Nothing to do */ - *pages_to_unuse = 0; - return 1; - } - total_pages_to_unuse = total_pages - target_pages; - return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); -} - -/* - * Frontswap, like a true swap device, may unnecessarily retain pages - * under certain circumstances; "shrink" frontswap is essentially a - * "partial swapoff" and works by calling try_to_unuse to attempt to - * unuse enough frontswap pages to attempt to -- subject to memory - * constraints -- reduce the number of pages in frontswap to the - * number given in the parameter target_pages. - */ -void frontswap_shrink(unsigned long target_pages) -{ - unsigned long pages_to_unuse = 0; - int type, ret; - - /* - * we don't want to hold swap_lock while doing a very - * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_active_head each time - */ - spin_lock(&swap_lock); - ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); - spin_unlock(&swap_lock); - if (ret == 0) - try_to_unuse(type, true, pages_to_unuse); - return; -} -EXPORT_SYMBOL(frontswap_shrink); - /* * Count and return the number of frontswap pages across all * swap devices. This is exported so that backend drivers can From 3e8e1af63d7a831f576477c25d9b89049bd2d53d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:47 -0800 Subject: [PATCH 0393/1295] frontswap: remove frontswap_curr_pages frontswap_curr_pages is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 1 - mm/frontswap.c | 28 ---------------------------- 2 files changed, 29 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index d268d7bb6513..5205c2977b20 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); diff --git a/mm/frontswap.c b/mm/frontswap.c index a77ebba6101b..af8f68d0e5cc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -330,34 +330,6 @@ void __frontswap_invalidate_area(unsigned type) } EXPORT_SYMBOL(__frontswap_invalidate_area); -static unsigned long __frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - struct swap_info_struct *si = NULL; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) - totalpages += atomic_read(&si->frontswap_pages); - return totalpages; -} - -/* - * Count and return the number of frontswap pages across all - * swap devices. This is exported so that backend drivers can - * determine current usage without reading debugfs. - */ -unsigned long frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - - spin_lock(&swap_lock); - totalpages = __frontswap_curr_pages(); - spin_unlock(&swap_lock); - - return totalpages; -} -EXPORT_SYMBOL(frontswap_curr_pages); - static int __init init_frontswap(void) { #ifdef CONFIG_DEBUG_FS From 1cf53c894d15dd4b73397a56fa055d76d3db66b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:51 -0800 Subject: [PATCH 0394/1295] frontswap: simplify frontswap_init Just use IS_ENABLED() and remove the __frontswap_init indirection. Also remove the unused export. Link: https://lkml.kernel.org/r/20211224062246.1258487-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 9 +-------- mm/frontswap.c | 3 +-- mm/swapfile.c | 3 ++- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 5205c2977b20..73d7beb44f2b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,7 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); -extern void __frontswap_init(unsigned type, unsigned long *map); +extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); @@ -107,11 +107,4 @@ static inline void frontswap_invalidate_area(unsigned type) __frontswap_invalidate_area(type); } -static inline void frontswap_init(unsigned type, unsigned long *map) -{ -#ifdef CONFIG_FRONTSWAP - __frontswap_init(type, map); -#endif -} - #endif /* _LINUX_FRONTSWAP_H */ diff --git a/mm/frontswap.c b/mm/frontswap.c index af8f68d0e5cc..132d6ad6d70b 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -156,7 +156,7 @@ EXPORT_SYMBOL(frontswap_register_ops); /* * Called when a swap device is swapon'd. */ -void __frontswap_init(unsigned type, unsigned long *map) +void frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; @@ -179,7 +179,6 @@ void __frontswap_init(unsigned type, unsigned long *map) for_each_frontswap_ops(ops) ops->init(type); } -EXPORT_SYMBOL(__frontswap_init); bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) diff --git a/mm/swapfile.c b/mm/swapfile.c index caa9f81a0d15..df5930ccd93d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2463,7 +2463,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { - frontswap_init(p->type, frontswap_map); + if (IS_ENABLED(CONFIG_FRONTSWAP)) + frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); From 360be5daa33fe74fc472195242ae2a2337c4320b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:54 -0800 Subject: [PATCH 0395/1295] frontswap: remove the frontswap exports None of the frontswap API is called from modular code. Link: https://lkml.kernel.org/r/20211224062246.1258487-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index 132d6ad6d70b..42d554da53bb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -151,7 +151,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } } } -EXPORT_SYMBOL(frontswap_register_ops); /* * Called when a swap device is swapon'd. @@ -187,7 +186,6 @@ bool __frontswap_test(struct swap_info_struct *sis, return test_bit(offset, sis->frontswap_map); return false; } -EXPORT_SYMBOL(__frontswap_test); static inline void __frontswap_set(struct swap_info_struct *sis, pgoff_t offset) @@ -250,7 +248,6 @@ int __frontswap_store(struct page *page) return ret; } -EXPORT_SYMBOL(__frontswap_store); /* * "Get" data from frontswap associated with swaptype and offset that were @@ -283,7 +280,6 @@ int __frontswap_load(struct page *page) inc_frontswap_loads(); return ret; } -EXPORT_SYMBOL(__frontswap_load); /* * Invalidate any data from frontswap associated with the specified swaptype @@ -305,7 +301,6 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } -EXPORT_SYMBOL(__frontswap_invalidate_page); /* * Invalidate all data from frontswap associated with all offsets for the @@ -327,7 +322,6 @@ void __frontswap_invalidate_area(unsigned type) atomic_set(&sis->frontswap_pages, 0); bitmap_zero(sis->frontswap_map, sis->max); } -EXPORT_SYMBOL(__frontswap_invalidate_area); static int __init init_frontswap(void) { From 10a9c496789fe2098bfc018650fc77b23ba08a54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:57 -0800 Subject: [PATCH 0396/1295] mm: simplify try_to_unuse Remove the unused frontswap and pages_to_unuse arguments, and mark the function static now that the caller in frontswap is gone. [akpm@linux-foundation.org: fix shmem_unuse() stub, per Matthew] Link: https://lkml.kernel.org/r/20211224062246.1258487-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 7 ---- include/linux/shmem_fs.h | 3 +- include/linux/swapfile.h | 1 - mm/shmem.c | 33 +++------------- mm/swapfile.c | 83 +++++++++++---------------------------- 5 files changed, 30 insertions(+), 97 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 73d7beb44f2b..a9817d4fa74c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,13 +7,6 @@ #include #include -/* - * Return code to denote that requested number of - * frontswap pages are unused(moved to page cache). - * Used in shmem_unuse and try_to_unuse. - */ -#define FRONTSWAP_PAGES_UNUSED 2 - struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 166158b6e917..e65b80ed09e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -83,8 +83,7 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse); +int shmem_unuse(unsigned int type); extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, pgoff_t index); diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e06febf62978..809cd01ef2c5 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -9,7 +9,6 @@ extern spinlock_t swap_lock; extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; -extern int try_to_unuse(unsigned int, bool, unsigned long); extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); diff --git a/mm/shmem.c b/mm/shmem.c index 66909efd0a1b..a09b29ec2b45 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -1152,7 +1151,7 @@ static void shmem_evict_inode(struct inode *inode) static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, - unsigned int type, bool frontswap) + unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; @@ -1173,9 +1172,6 @@ static int shmem_find_swap_entries(struct address_space *mapping, entry = radix_to_swp_entry(page); if (swp_type(entry) != type) continue; - if (frontswap && - !frontswap_test(swap_info[type], swp_offset(entry))) - continue; indices[ret] = xas.xa_index; entries[ret] = page; @@ -1228,26 +1224,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct inode *inode, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; - bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); int ret = 0; pagevec_init(&pvec); do { unsigned int nr_entries = PAGEVEC_SIZE; - if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) - nr_entries = *fs_pages_to_unuse; - pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, - pvec.pages, indices, - type, frontswap); + pvec.pages, indices, type); if (pvec.nr == 0) { ret = 0; break; @@ -1257,14 +1247,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, if (ret < 0) break; - if (frontswap_partial) { - *fs_pages_to_unuse -= ret; - if (*fs_pages_to_unuse == 0) { - ret = FRONTSWAP_PAGES_UNUSED; - break; - } - } - start = indices[pvec.nr - 1]; } while (true); @@ -1276,8 +1258,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, * device 'type' back into memory, so the swap device can be * unused. */ -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; @@ -1300,8 +1281,7 @@ int shmem_unuse(unsigned int type, bool frontswap, atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, - fs_pages_to_unuse); + error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); mutex_lock(&shmem_swaplist_mutex); @@ -4015,8 +3995,7 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index df5930ccd93d..82342c77791b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,8 +1923,7 @@ out: static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { struct page *page; swp_entry_t entry; @@ -1945,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; offset = swp_offset(entry); - if (frontswap && !frontswap_test(si, offset)) - continue; - pte_unmap(pte); swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); @@ -1979,11 +1975,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, try_to_free_swap(page); unlock_page(page); put_page(page); - - if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { - ret = FRONTSWAP_PAGES_UNUSED; - goto out; - } try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1996,8 +1987,7 @@ out: static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pmd_t *pmd; unsigned long next; @@ -2009,8 +1999,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -2019,8 +2008,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pud_t *pud; unsigned long next; @@ -2031,8 +2019,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -2041,8 +2028,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { p4d_t *p4d; unsigned long next; @@ -2053,16 +2039,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; @@ -2076,16 +2060,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_p4d_range(vma, pgd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; @@ -2093,8 +2075,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) { - ret = unuse_vma(vma, type, frontswap, - fs_pages_to_unuse); + ret = unuse_vma(vma, type); if (ret) break; } @@ -2110,7 +2091,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, * if there are no inuse entries after prev till end of the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev, bool frontswap) + unsigned int prev) { unsigned int i; unsigned char count; @@ -2124,8 +2105,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - if (!frontswap || frontswap_test(si, i)) - break; + break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } @@ -2136,12 +2116,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, return i; } -/* - * If the boolean frontswap is true, only unuse pages_to_unuse pages; - * pages_to_unuse==0 means all pages; ignored if frontswap is false - */ -int try_to_unuse(unsigned int type, bool frontswap, - unsigned long pages_to_unuse) +static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; @@ -2155,13 +2130,10 @@ int try_to_unuse(unsigned int type, bool frontswap, if (!READ_ONCE(si->inuse_pages)) return 0; - if (!frontswap) - pages_to_unuse = 0; - retry: - retval = shmem_unuse(type, frontswap, &pages_to_unuse); + retval = shmem_unuse(type); if (retval) - goto out; + return retval; prev_mm = &init_mm; mmget(prev_mm); @@ -2178,11 +2150,10 @@ retry: spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; - retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - + retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); - goto out; + return retval; } /* @@ -2199,7 +2170,7 @@ retry: i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && - (i = find_next_to_unuse(si, i, frontswap)) != 0) { + (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); page = find_get_page(swap_address_space(entry), i); @@ -2217,14 +2188,6 @@ retry: try_to_free_swap(page); unlock_page(page); put_page(page); - - /* - * For frontswap, we just need to unuse pages_to_unuse, if - * it was specified. Need not check frontswap again here as - * we already zeroed out pages_to_unuse if not frontswap. - */ - if (pages_to_unuse && --pages_to_unuse == 0) - goto out; } /* @@ -2242,10 +2205,10 @@ retry: if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; - retval = -EINTR; + return -EINTR; } -out: - return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; + + return 0; } /* @@ -2577,7 +2540,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) disable_swap_slots_cache_lock(); set_current_oom_origin(); - err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { From bd9cd521496ba8d537d8f46f4167bf4221aba9a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:01 -0800 Subject: [PATCH 0397/1295] frontswap: remove frontswap_test frontswap_test is unused now, remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 11 ----------- mm/frontswap.c | 2 +- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index a9817d4fa74c..c5b2848d2240 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -18,7 +18,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); -extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); @@ -33,11 +32,6 @@ static inline bool frontswap_enabled(void) return static_branch_unlikely(&frontswap_enabled_key); } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return __frontswap_test(sis, offset); -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { @@ -56,11 +50,6 @@ static inline bool frontswap_enabled(void) return false; } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return false; -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { diff --git a/mm/frontswap.c b/mm/frontswap.c index 42d554da53bb..f51159f0d75d 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -179,7 +179,7 @@ void frontswap_init(unsigned type, unsigned long *map) ops->init(type); } -bool __frontswap_test(struct swap_info_struct *sis, +static bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { if (sis->frontswap_map) From f328c1d16e4c764992895ac9c9425cea861b2ca0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:04 -0800 Subject: [PATCH 0398/1295] frontswap: simplify frontswap_register_ops Given that frontswap_register_ops must be called from built-in code, there is no need to handle the case of swapfiles coming online before or during it, so delete the code that deals with that case. Link: https://lkml.kernel.org/r/20211224062246.1258487-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index f51159f0d75d..35040fa4eba8 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -99,25 +99,6 @@ static inline void inc_frontswap_invalidates(void) { } */ void frontswap_register_ops(struct frontswap_ops *ops) { - DECLARE_BITMAP(a, MAX_SWAPFILES); - DECLARE_BITMAP(b, MAX_SWAPFILES); - struct swap_info_struct *si; - unsigned int i; - - bitmap_zero(a, MAX_SWAPFILES); - bitmap_zero(b, MAX_SWAPFILES); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (!WARN_ON(!si->frontswap_map)) - __set_bit(si->type, a); - } - spin_unlock(&swap_lock); - - /* the new ops needs to know the currently active swap devices */ - for_each_set_bit(i, a, MAX_SWAPFILES) - ops->init(i); - /* * Setting frontswap_ops must happen after the ops->init() calls * above; cmpxchg implies smp_mb() which will ensure the init is @@ -128,28 +109,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); static_branch_inc(&frontswap_enabled_key); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (si->frontswap_map) - __set_bit(si->type, b); - } - spin_unlock(&swap_lock); - - /* - * On the very unlikely chance that a swap device was added or - * removed between setting the "a" list bits and the ops init - * calls, we re-check and do init or invalidate for any changed - * bits. - */ - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { - for (i = 0; i < MAX_SWAPFILES; i++) { - if (!test_bit(i, a) && test_bit(i, b)) - ops->init(i); - else if (test_bit(i, a) && !test_bit(i, b)) - ops->invalidate_area(i); - } - } } /* From 633423a09cb5cfe61438283e1ce49c23cf4a0611 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:07 -0800 Subject: [PATCH 0399/1295] mm: mark swap_lock and swap_active_head static swap_lock and swap_active_head are only used in swapfile.c, so mark them static. Link: https://lkml.kernel.org/r/20211224062246.1258487-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapfile.h | 2 -- mm/swapfile.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 809cd01ef2c5..54078542134c 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -6,8 +6,6 @@ * these were static in swapfile.c but frontswap.c needs them and we don't * want to expose them to the dozens of source files that include swap.h */ -extern spinlock_t swap_lock; -extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); diff --git a/mm/swapfile.c b/mm/swapfile.c index 82342c77791b..bf0df7aa7158 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -DEFINE_SPINLOCK(swap_lock); +static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* @@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ -PLIST_HEAD(swap_active_head); +static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs From 1da0d94a3ec8c5f3793b7be8538b55e60ebeefe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:10 -0800 Subject: [PATCH 0400/1295] frontswap: remove support for multiple ops There is only a single instance of frontswap ops in the kernel, so simplify the frontswap code by removing support for multiple operations. Link: https://lkml.kernel.org/r/20211224062246.1258487-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 3 +-- mm/frontswap.c | 50 ++++++++++----------------------------- mm/zswap.c | 8 +++++-- 3 files changed, 19 insertions(+), 42 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index c5b2848d2240..a631bac12220 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -13,10 +13,9 @@ struct frontswap_ops { int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ - struct frontswap_ops *next; /* private pointer to next ops */ }; -extern void frontswap_register_ops(struct frontswap_ops *ops); +int frontswap_register_ops(const struct frontswap_ops *ops); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); diff --git a/mm/frontswap.c b/mm/frontswap.c index 35040fa4eba8..6f69b044a8cc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -27,10 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); * may be registered, but implementations can never deregister. This * is a simple singly-linked list of all registered implementations. */ -static struct frontswap_ops *frontswap_ops __read_mostly; - -#define for_each_frontswap_ops(ops) \ - for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) +static const struct frontswap_ops *frontswap_ops __read_mostly; #ifdef CONFIG_DEBUG_FS /* @@ -97,18 +94,14 @@ static inline void inc_frontswap_invalidates(void) { } /* * Register operations for frontswap */ -void frontswap_register_ops(struct frontswap_ops *ops) +int frontswap_register_ops(const struct frontswap_ops *ops) { - /* - * Setting frontswap_ops must happen after the ops->init() calls - * above; cmpxchg implies smp_mb() which will ensure the init is - * complete at this point. - */ - do { - ops->next = frontswap_ops; - } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + if (frontswap_ops) + return -EINVAL; + frontswap_ops = ops; static_branch_inc(&frontswap_enabled_key); + return 0; } /* @@ -117,7 +110,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) void frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(sis == NULL); @@ -133,9 +125,7 @@ void frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); - - for_each_frontswap_ops(ops) - ops->init(type); + frontswap_ops->init(type); } static bool __frontswap_test(struct swap_info_struct *sis, @@ -174,7 +164,6 @@ int __frontswap_store(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -188,16 +177,10 @@ int __frontswap_store(struct page *page) */ if (__frontswap_test(sis, offset)) { __frontswap_clear(sis, offset); - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); } - /* Try to store in each implementation, until one succeeds. */ - for_each_frontswap_ops(ops) { - ret = ops->store(type, offset, page); - if (!ret) /* successful store */ - break; - } + ret = frontswap_ops->store(type, offset, page); if (ret == 0) { __frontswap_set(sis, offset); inc_frontswap_succ_stores(); @@ -220,7 +203,6 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -230,11 +212,7 @@ int __frontswap_load(struct page *page) return -1; /* Try loading from each implementation, until one succeeds. */ - for_each_frontswap_ops(ops) { - ret = ops->load(type, offset, page); - if (!ret) /* successful load */ - break; - } + ret = frontswap_ops->load(type, offset, page); if (ret == 0) inc_frontswap_loads(); return ret; @@ -247,7 +225,6 @@ int __frontswap_load(struct page *page) void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -255,8 +232,7 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) if (!__frontswap_test(sis, offset)) return; - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } @@ -268,7 +244,6 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -276,8 +251,7 @@ void __frontswap_invalidate_area(unsigned type) if (sis->frontswap_map == NULL) return; - for_each_frontswap_ops(ops) - ops->invalidate_area(type); + frontswap_ops->invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); bitmap_zero(sis->frontswap_map, sis->max); } diff --git a/mm/zswap.c b/mm/zswap.c index 7944e3e57e78..cdf6950fcb2e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1378,7 +1378,7 @@ static void zswap_frontswap_init(unsigned type) zswap_trees[type] = tree; } -static struct frontswap_ops zswap_frontswap_ops = { +static const struct frontswap_ops zswap_frontswap_ops = { .store = zswap_frontswap_store, .load = zswap_frontswap_load, .invalidate_page = zswap_frontswap_invalidate_page, @@ -1475,11 +1475,15 @@ static int __init init_zswap(void) if (!shrink_wq) goto fallback_fail; - frontswap_register_ops(&zswap_frontswap_ops); + ret = frontswap_register_ops(&zswap_frontswap_ops); + if (ret) + goto destroy_wq; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; +destroy_wq: + destroy_workqueue(shrink_wq); fallback_fail: if (pool) zswap_pool_destroy(pool); From 6e61dde82e8bfe65e8ebbe43da45e615bc529236 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:14 -0800 Subject: [PATCH 0401/1295] mm: hide the FRONTSWAP Kconfig symbol Select FRONTSWAP from ZSWAP instead of prompting for it. Link: https://lkml.kernel.org/r/20211224062246.1258487-14-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 430240289b02..3326ee3903f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -445,20 +445,7 @@ config HAVE_SETUP_PER_CPU_AREA bool config FRONTSWAP - bool "Enable frontswap to cache swap pages if tmem is present" - depends on SWAP - help - Frontswap is so named because it can be thought of as the opposite - of a "backing" store for a swap device. The data is stored into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. When space in transcendent memory is available, - a significant swap I/O reduction may be achieved. When none is - available, all frontswap calls are reduced to a single pointer- - compare-against-NULL resulting in a negligible performance hit - and swap data is stored as normal on the matching swap device. - - If unsure, say Y to enable frontswap. + bool config CMA bool "Contiguous Memory Allocator" @@ -523,7 +510,8 @@ config MEM_SOFT_DIRTY config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" - depends on FRONTSWAP && CRYPTO=y + depends on SWAP && CRYPTO=y + select FRONTSWAP select ZPOOL help A lightweight compressed cache for swap pages. It takes From d11a327ed95dbec756b99cbfef2a7fd85c9eeb09 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jan 2022 21:07:47 +0000 Subject: [PATCH 0402/1295] KVM: arm64: vgic-v3: Restrict SEIS workaround to known broken systems Contrary to what df652bcf1136 ("KVM: arm64: vgic-v3: Work around GICv3 locally generated SErrors") was asserting, there is at least one other system out there (Cavium ThunderX2) implementing SEIS, and not in an obviously broken way. So instead of imposing the M1 workaround on an innocent bystander, let's limit it to the two known broken Apple implementations. Fixes: df652bcf1136 ("KVM: arm64: vgic-v3: Work around GICv3 locally generated SErrors") Reported-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220122103912.795026-1-maz@kernel.org --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 +++ arch/arm64/kvm/vgic/vgic-v3.c | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 20db2f281cf2..4fb419f7b8b6 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; + /* SEIS */ + if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) + val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 78cf674c1230..221489c29354 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf) } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); +static const struct midr_range broken_seis[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + {}, +}; + +static bool vgic_v3_broken_seis(void) +{ + return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && + is_midr_in_range_list(read_cpuid_id(), broken_seis)); +} + /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description @@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group1_trap = true; } - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) { - kvm_info("GICv3 with locally generated SEI\n"); + if (vgic_v3_broken_seis()) { + kvm_info("GICv3 with broken locally generated SEI\n"); + kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; group0_trap = true; group1_trap = true; if (ich_vtr_el2 & ICH_VTR_TDS_MASK) From a6501e4b380faee6bbf41bb2f833977c7ac6491a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jan 2022 10:20:03 -0800 Subject: [PATCH 0403/1295] eeprom: at25: Restore missing allocation The at25 driver regressed in v5.17-rc1 due to a broken conflict resolution: the allocation of the object was accidentally removed. Restore it. This was found when building under CONFIG_FORTIFY_SOURCE=y and -Warray-bounds, which complained about strncpy() being used against an empty object: In function 'strncpy', inlined from 'at25_fw_to_chip.constprop' at drivers/misc/eeprom/at25.c:312:2: ./include/linux/fortify-string.h:48:33: warning: '__builtin_strncpy' offset [0, 9] is out of the bounds [0, 0] [-Warray-bounds] 48 | #define __underlying_strncpy __builtin_strncpy | ^ ./include/linux/fortify-string.h:59:16: note: in expansion of macro '__underlying_strncpy' 59 | return __underlying_strncpy(p, q, size); | ^~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'at25_fram_to_chip' at drivers/misc/eeprom/at25.c:373:2, inlined from 'at25_probe' at drivers/misc/eeprom/at25.c:453:10: ./include/linux/fortify-string.h:48:33: warning: '__builtin_strncpy' offset [0, 9] is out of the bounds [0, 0] [-Warray-bounds] 48 | #define __underlying_strncpy __builtin_strncpy | ^ ./include/linux/fortify-string.h:59:16: note: in expansion of macro '__underlying_strncpy' 59 | return __underlying_strncpy(p, q, size); | ^~~~~~~~~~~~~~~~~~~~ Link: https://lore.kernel.org/lkml/CAHp75VdqK7h63fz-cPaQ2MGaVdaR2f1Fb5kKCZidUG3RwLsAVA@mail.gmail.com/ Fixes: af40d16042d6 ("Merge v5.15-rc5 into char-misc-next") Cc: Arnd Bergmann Cc: Jiri Prchal Reviewed-by: Andy Shevchenko Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220118182003.3385019-1-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/at25.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index c3305bdda69c..bee727ed98db 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -440,6 +440,10 @@ static int at25_probe(struct spi_device *spi) return -ENXIO; } + at25 = devm_kzalloc(&spi->dev, sizeof(*at25), GFP_KERNEL); + if (!at25) + return -ENOMEM; + mutex_init(&at25->lock); at25->spi = spi; spi_set_drvdata(spi, at25); From 10756dc5b02bff370ddd351d7744bc99ada659c2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Jan 2022 04:24:02 +0300 Subject: [PATCH 0404/1295] usr/include/Makefile: add linux/nfc.h to the compile-test coverage As linux/nfc.h userspace compilation was finally fixed by commits 79b69a83705e ("nfc: uapi: use kernel size_t to fix user-space builds") and 7175f02c4e5f ("uapi: fix linux/nfc.h userspace compilation errors"), there is no need to keep the compile-test exception for it in usr/include/Makefile. Signed-off-by: Dmitry V. Levin Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index 94403806ea56..7be7468c177b 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -34,7 +34,6 @@ no-header-test += linux/hdlc/ioctl.h no-header-test += linux/ivtv.h no-header-test += linux/kexec.h no-header-test += linux/matroxfb.h -no-header-test += linux/nfc.h no-header-test += linux/omap3isp.h no-header-test += linux/omapfb.h no-header-test += linux/patchkey.h From e92e2634ef3a95376ad917452a476fccaff83fde Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 20 Jan 2022 14:31:00 +0900 Subject: [PATCH 0405/1295] Revert "Makefile: Do not quote value for CONFIG_CC_IMPLICIT_FALLTHROUGH" This reverts commit cd8c917a56f20f48748dd43d9ae3caff51d5b987. Commit 129ab0d2d9f3 ("kbuild: do not quote string values in include/config/auto.conf") provided the final solution. Now reverting the temporary workaround. Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3f07f0f04475..c94559a97dca 100644 --- a/Makefile +++ b/Makefile @@ -778,7 +778,7 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong KBUILD_CFLAGS += $(stackp-flags-y) KBUILD_CFLAGS-$(CONFIG_WERROR) += -Werror -KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH:"%"=%) +KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH) ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += -Qunused-arguments From ad29a2fb3c201ef066b0a9fe10a6e14dd0d59c48 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Jan 2022 04:22:04 +0900 Subject: [PATCH 0406/1295] certs: Fix build error when CONFIG_MODULE_SIG_KEY is PKCS#11 URI When CONFIG_MODULE_SIG_KEY is PKCS#11 URL (pkcs11:*), signing_key.x509 fails to build: certs/Makefile:77: *** target pattern contains no '%'. Stop. Due to the typo, $(X509_DEP) contains a colon. Fix it. Fixes: b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro") Signed-off-by: Masahiro Yamada --- certs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/certs/Makefile b/certs/Makefile index f7041c29a2e0..0c459cfd09df 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -68,7 +68,7 @@ $(obj)/x509.genkey: endif # CONFIG_MODULE_SIG_KEY # If CONFIG_MODULE_SIG_KEY isn't a PKCS#11 URI, depend on it -ifneq ($(filter-out pkcs11:%, %(CONFIG_MODULE_SIG_KEY)),) +ifneq ($(filter-out pkcs11:%, $(CONFIG_MODULE_SIG_KEY)),) X509_DEP := $(CONFIG_MODULE_SIG_KEY) endif From e6340b6526eeec5a00fe26a6ff515afe7d0affa4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Jan 2022 04:22:05 +0900 Subject: [PATCH 0407/1295] certs: Fix build error when CONFIG_MODULE_SIG_KEY is empty Since b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro"), when CONFIG_MODULE_SIG_KEY is empty, signing_key.x509 fails to build: CERT certs/signing_key.x509 Usage: extract-cert make[1]: *** [certs/Makefile:78: certs/signing_key.x509] Error 2 make: *** [Makefile:1831: certs] Error 2 Pass "" to the first argument of extract-cert to fix the build error. Link: https://lore.kernel.org/linux-kbuild/20220120094606.2skuyb26yjlnu66q@lion.mk-sys.cz/T/#u Fixes: b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro") Reported-by: Michal Kubecek Signed-off-by: Masahiro Yamada Tested-by: Michal Kubecek --- certs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/certs/Makefile b/certs/Makefile index 0c459cfd09df..3ea7fe60823f 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -75,7 +75,7 @@ endif $(obj)/system_certificates.o: $(obj)/signing_key.x509 $(obj)/signing_key.x509: $(X509_DEP) $(obj)/extract-cert FORCE - $(call if_changed,extract_certs,$(if $(X509_DEP),$<,$(CONFIG_MODULE_SIG_KEY))) + $(call if_changed,extract_certs,$(if $(CONFIG_MODULE_SIG_KEY),$(if $(X509_DEP),$<,$(CONFIG_MODULE_SIG_KEY)),"")) endif # CONFIG_MODULE_SIG targets += signing_key.x509 From 16436f70abeebb29cd99444e27b310755806c1fa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 22 Jan 2022 16:16:14 +0100 Subject: [PATCH 0408/1295] irqchip/gic-v3-its: Fix build for !SMP Commit 835f442fdbce ("irqchip/gic-v3-its: Limit memreserve cpuhp state lifetime") added a reference to cpus_booted_once_mask, which does not exist on !SMP builds, breaking the build for such configurations. Given the intent of the check, short circuit it to always pass. Cc: Valentin Schneider Fixes: 835f442fdbce ("irqchip/gic-v3-its: Limit memreserve cpuhp state lifetime") Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220122151614.133766-1-ardb@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index ee83eb377d7e..7b8f1ec0ff78 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -5241,7 +5241,8 @@ static int its_cpu_memreserve_lpi(unsigned int cpu) out: /* Last CPU being brought up gets to issue the cleanup */ - if (cpumask_equal(&cpus_booted_once_mask, cpu_possible_mask)) + if (!IS_ENABLED(CONFIG_SMP) || + cpumask_equal(&cpus_booted_once_mask, cpu_possible_mask)) schedule_work(&rdist_memreserve_cpuhp_cleanup_work); gic_data_rdist()->flags |= RD_LOCAL_MEMRESERVE_DONE; From 9edcde68d653e1f8f895fbb69a0043c6a56ae35e Mon Sep 17 00:00:00 2001 From: Yao Jin Date: Fri, 21 Jan 2022 14:59:54 +0800 Subject: [PATCH 0409/1295] perf script: Fix printing 'phys_addr' failure issue Perf script was failed to print the phys_addr for SPE profiling. One 'dummy' event is added by SPE profiling but it doesn't have PHYS_ADDR attribute set, perf script then exits with error. Now referring to 'addr', use evsel__do_check_stype() to check the type. Before: # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\ store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr Samples for 'dummy:u' event do not have PHYS_ADDR attribute set. Cannot print 'phys_addr' field. After: # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\ store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr 4064384/4064384 ffff802f921be0d0 2f921be0d0 4064384/4064384 ffff802f921be0d0 2f921be0d0 Reviewed-by: German Gomez Signed-off-by: Yao Jin Cc: Alexander Shishkin Cc: Hanjun Guo Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20220121065954.2121900-1-liwei391@huawei.com Signed-off-by: Wei Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ecd4f99a6c14..abae8184e171 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -515,7 +515,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(PHYS_ADDR) && - evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR)) + evsel__do_check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR, allow_user_set)) return -EINVAL; if (PRINT_FIELD(DATA_PAGE_SIZE) && From 1d1d9af254ffc3bc38c59484c50c600d1d0c96da Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 21 Jan 2022 20:58:09 -0800 Subject: [PATCH 0410/1295] perf python: Fix cpu_map__item() building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Value should be built as an integer. Switch some uses of perf_cpu_map to use the library API. Fixes: 6d18804b963b78dc ("perf cpumap: Give CPUs their own type") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: André Almeida Cc: Andrew Morton Cc: Andy Shevchenko Cc: Darren Hart Cc: Davidlohr Bueso Cc: Dmitriy Vyukov Cc: Eric Dumazet Cc: German Gomez Cc: Ian Rogers Cc: James Clark Cc: Jin Yao Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miaoqian Lin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Shunsuke Nakamura Cc: Song Liu Cc: Stephane Eranian Cc: Stephen Brennan Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Thomas Richter Cc: Yury Norov Link: http://lore.kernel.org/lkml/20220122045811.3402706-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f3e5131f183c..52d8995cfd73 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -638,17 +638,17 @@ static Py_ssize_t pyrf_cpu_map__length(PyObject *obj) { struct pyrf_cpu_map *pcpus = (void *)obj; - return pcpus->cpus->nr; + return perf_cpu_map__nr(pcpus->cpus); } static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i) { struct pyrf_cpu_map *pcpus = (void *)obj; - if (i >= pcpus->cpus->nr) + if (i >= perf_cpu_map__nr(pcpus->cpus)) return NULL; - return Py_BuildValue("i", pcpus->cpus->map[i]); + return Py_BuildValue("i", perf_cpu_map__cpu(pcpus->cpus, i).cpu); } static PySequenceMethods pyrf_cpu_map__sequence_methods = { From 440286993960bea4aa09d912a5497d92d09ae54c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 21 Jan 2022 20:58:10 -0800 Subject: [PATCH 0411/1295] perf cpumap: Migrate to libperf cpumap api MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from directly accessing the perf_cpu_map to using the appropriate libperf API when possible. Using the API simplifies the job of refactoring use of perf_cpu_map. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Andrew Morton Cc: André Almeida Cc: Andy Shevchenko Cc: Darren Hart Cc: Davidlohr Bueso Cc: Dmitriy Vyukov Cc: Eric Dumazet Cc: German Gomez Cc: James Clark Cc: Jin Yao Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miaoqian Lin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Shunsuke Nakamura Cc: Song Liu Cc: Stephane Eranian Cc: Stephen Brennan Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Thomas Richter Cc: Yury Norov Link: http://lore.kernel.org/lkml/20220122045811.3402706-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/evsel.c | 4 +- tools/perf/bench/epoll-ctl.c | 2 +- tools/perf/bench/epoll-wait.c | 2 +- tools/perf/bench/evlist-open-close.c | 4 +- tools/perf/bench/futex-hash.c | 2 +- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex-requeue.c | 2 +- tools/perf/bench/futex-wake-parallel.c | 2 +- tools/perf/bench/futex-wake.c | 2 +- tools/perf/builtin-ftrace.c | 2 +- tools/perf/builtin-stat.c | 7 ++-- tools/perf/tests/bitmap.c | 4 +- tools/perf/tests/event_update.c | 8 ++-- tools/perf/tests/mem2node.c | 9 +++-- tools/perf/tests/mmap-basic.c | 5 ++- tools/perf/tests/topology.c | 37 +++++++++++-------- tools/perf/util/auxtrace.c | 2 +- tools/perf/util/counts.c | 2 +- tools/perf/util/cpumap.h | 2 +- tools/perf/util/cputopo.c | 4 +- tools/perf/util/evlist-hybrid.c | 11 +++--- tools/perf/util/evsel.c | 20 +++++----- tools/perf/util/evsel.h | 3 +- tools/perf/util/mmap.c | 2 +- tools/perf/util/perf_api_probe.c | 4 +- tools/perf/util/record.c | 6 +-- .../scripting-engines/trace-event-python.c | 4 +- tools/perf/util/session.c | 4 +- tools/perf/util/svghelper.c | 4 +- tools/perf/util/synthetic-events.c | 18 ++++----- tools/perf/util/top.c | 6 +-- 31 files changed, 99 insertions(+), 87 deletions(-) diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 7ea86a44eae5..210ea7c06ce8 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -141,7 +141,7 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus, } if (evsel->fd == NULL && - perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0) + perf_evsel__alloc_fd(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) return -ENOMEM; perf_cpu_map__for_each_cpu(cpu, idx, cpus) { @@ -384,7 +384,7 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter) { int err = 0, i; - for (i = 0; i < evsel->cpus->nr && !err; i++) + for (i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++) err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_SET_FILTER, (void *)filter, i); diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 1a17ec83d3c4..740ae764537e 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -333,7 +333,7 @@ int bench_epoll_ctl(int argc, const char **argv) /* default to the number of CPUs */ if (!nthreads) - nthreads = cpu->nr; + nthreads = perf_cpu_map__nr(cpu); worker = calloc(nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 0d1dd8879197..37de970c9743 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -452,7 +452,7 @@ int bench_epoll_wait(int argc, const char **argv) /* default to the number of CPUs and leave one for the writer pthread */ if (!nthreads) - nthreads = cpu->nr - 1; + nthreads = perf_cpu_map__nr(cpu) - 1; worker = calloc(nthreads, sizeof(*worker)); if (!worker) { diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c index 482738e9bdad..de56601f69ee 100644 --- a/tools/perf/bench/evlist-open-close.c +++ b/tools/perf/bench/evlist-open-close.c @@ -71,7 +71,7 @@ static int evlist__count_evsel_fds(struct evlist *evlist) int cnt = 0; evlist__for_each_entry(evlist, evsel) - cnt += evsel->core.threads->nr * evsel->core.cpus->nr; + cnt += evsel->core.threads->nr * perf_cpu_map__nr(evsel->core.cpus); return cnt; } @@ -151,7 +151,7 @@ static int bench_evlist_open_close__run(char *evstr) init_stats(&time_stats); - printf(" Number of cpus:\t%d\n", evlist->core.cpus->nr); + printf(" Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.cpus)); printf(" Number of threads:\t%d\n", evlist->core.threads->nr); printf(" Number of events:\t%d (%d fds)\n", evlist->core.nr_entries, evlist__count_evsel_fds(evlist)); diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 9627b6ab8670..dbcecec4eeda 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -150,7 +150,7 @@ int bench_futex_hash(int argc, const char **argv) } if (!params.nthreads) /* default to the number of CPUs */ - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index a512a320df74..6fc9a3d55c1f 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -173,7 +173,7 @@ int bench_futex_lock_pi(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index aca47ce8b1e7..2f59d5d1c509 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -175,7 +175,7 @@ int bench_futex_requeue(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 888ee6037945..861deb934745 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -252,7 +252,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) err(EXIT_FAILURE, "calloc"); if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); /* some sanity checks */ if (params.nwakes > params.nthreads || diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index aa82db51c0ab..cfda48bef1d7 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -151,7 +151,7 @@ int bench_futex_wake(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 71452599f87d..dec24dc0e767 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -281,7 +281,7 @@ static int set_tracing_cpumask(struct perf_cpu_map *cpumap) int ret; int last_cpu; - last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1).cpu; + last_cpu = perf_cpu_map__cpu(cpumap, perf_cpu_map__nr(cpumap) - 1).cpu; mask_size = last_cpu / 4 + 2; /* one more byte for EOS */ mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */ diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 934e992c966f..3f98689dd687 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -230,11 +230,12 @@ static bool cpus_map_matched(struct evsel *a, struct evsel *b) if (!a->core.cpus || !b->core.cpus) return false; - if (a->core.cpus->nr != b->core.cpus->nr) + if (perf_cpu_map__nr(a->core.cpus) != perf_cpu_map__nr(b->core.cpus)) return false; - for (int i = 0; i < a->core.cpus->nr; i++) { - if (a->core.cpus->map[i].cpu != b->core.cpus->map[i].cpu) + for (int i = 0; i < perf_cpu_map__nr(a->core.cpus); i++) { + if (perf_cpu_map__cpu(a->core.cpus, i).cpu != + perf_cpu_map__cpu(b->core.cpus, i).cpu) return false; } diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 0bf399c49849..4965dd666956 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -17,8 +17,8 @@ static unsigned long *get_bitmap(const char *str, int nbits) bm = bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < map->nr; i++) - set_bit(map->map[i].cpu, bm); + for (i = 0; i < perf_cpu_map__nr(map); i++) + set_bit(perf_cpu_map__cpu(map, i).cpu, bm); } if (map) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 16b6d6f47f38..78db4d704e76 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -75,10 +75,10 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS); - TEST_ASSERT_VAL("wrong cpus", map->nr == 3); - TEST_ASSERT_VAL("wrong cpus", map->map[0].cpu == 1); - TEST_ASSERT_VAL("wrong cpus", map->map[1].cpu == 2); - TEST_ASSERT_VAL("wrong cpus", map->map[2].cpu == 3); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__nr(map) == 3); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 0).cpu == 1); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 1).cpu == 2); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 2).cpu == 3); perf_cpu_map__put(map); return 0; } diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index f4a4aba33f76..4c96829510c9 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -25,14 +25,15 @@ static unsigned long *get_bitmap(const char *str, int nbits) { struct perf_cpu_map *map = perf_cpu_map__new(str); unsigned long *bm = NULL; - int i; bm = bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < map->nr; i++) { - set_bit(map->map[i].cpu, bm); - } + struct perf_cpu cpu; + int i; + + perf_cpu_map__for_each_cpu(cpu, i, map) + set_bit(cpu.cpu, bm); } if (map) diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 0ad62914b4d7..c3c17600f29c 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -59,11 +59,12 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest } CPU_ZERO(&cpu_set); - CPU_SET(cpus->map[0].cpu, &cpu_set); + CPU_SET(perf_cpu_map__cpu(cpus, 0).cpu, &cpu_set); sched_setaffinity(0, sizeof(cpu_set), &cpu_set); if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { pr_debug("sched_setaffinity() failed on CPU %d: %s ", - cpus->map[0].cpu, str_error_r(errno, sbuf, sizeof(sbuf))); + perf_cpu_map__cpu(cpus, 0).cpu, + str_error_r(errno, sbuf, sizeof(sbuf))); goto out_free_cpus; } diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index c4ef0c7002f1..ee1e3dcbc0bd 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -122,44 +122,48 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that CPU ID contains socket, die, core and CPU - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL); - TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i].cpu == id.cpu.cpu); + TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", + perf_cpu_map__cpu(map, i).cpu == id.cpu.cpu); TEST_ASSERT_VAL("Cpu map - Core ID doesn't match", - session->header.env.cpu[map->map[i].cpu].core_id == id.core); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Cpu map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1); } // Test that core ID contains socket, die and core - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[map->map[i].cpu].core_id == id.core); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1); } // Test that die ID contains socket and die - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); @@ -168,10 +172,11 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that socket ID contains only socket - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); @@ -181,10 +186,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that node ID contains only node - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(map->map[i]) == id.node); + cpu__get_node(perf_cpu_map__cpu(map, i)) == id.node); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 5632efc44738..825336304a37 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, mp->idx = idx; if (per_cpu) { - mp->cpu = evlist->core.cpus->map[idx]; + mp->cpu = perf_cpu_map__cpu(evlist->core.cpus, idx); if (evlist->core.threads) mp->tid = perf_thread_map__pid(evlist->core.threads, 0); else diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c index 2b81707b9dba..7a447d918458 100644 --- a/tools/perf/util/counts.c +++ b/tools/perf/util/counts.c @@ -61,7 +61,7 @@ int evsel__alloc_counts(struct evsel *evsel) struct perf_cpu_map *cpus = evsel__cpus(evsel); int nthreads = perf_thread_map__nr(evsel->core.threads); - evsel->counts = perf_counts__new(cpus ? cpus->nr : 1, nthreads); + evsel->counts = perf_counts__new(perf_cpu_map__nr(cpus), nthreads); return evsel->counts != NULL ? 0 : -ENOMEM; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 0d3c2006a15d..5c85fbd709b4 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -57,7 +57,7 @@ struct perf_cpu cpu__max_present_cpu(void); */ static inline bool cpu_map__is_dummy(struct perf_cpu_map *cpus) { - return cpus->nr == 1 && cpus->map[0].cpu == -1; + return perf_cpu_map__nr(cpus) == 1 && perf_cpu_map__cpu(cpus, 0).cpu == -1; } /** diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index e20b835a1194..d275d843c155 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -325,7 +325,7 @@ struct numa_topology *numa_topology__new(void) if (!node_map) goto out; - nr = (u32) node_map->nr; + nr = (u32) perf_cpu_map__nr(node_map); tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0])*nr); if (!tp) @@ -334,7 +334,7 @@ struct numa_topology *numa_topology__new(void) tp->nr = nr; for (i = 0; i < nr; i++) { - if (load_numa_node(&tp->nodes[i], node_map->map[i].cpu)) { + if (load_numa_node(&tp->nodes[i], perf_cpu_map__cpu(node_map, i).cpu)) { numa_topology__delete(tp); tp = NULL; break; diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index 7c554234b43d..7f234215147d 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -124,22 +124,23 @@ int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list) events_nr++; - if (matched_cpus->nr > 0 && (unmatched_cpus->nr > 0 || - matched_cpus->nr < cpus->nr || - matched_cpus->nr < pmu->cpus->nr)) { + if (perf_cpu_map__nr(matched_cpus) > 0 && + (perf_cpu_map__nr(unmatched_cpus) > 0 || + perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(cpus) || + perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(pmu->cpus))) { perf_cpu_map__put(evsel->core.cpus); perf_cpu_map__put(evsel->core.own_cpus); evsel->core.cpus = perf_cpu_map__get(matched_cpus); evsel->core.own_cpus = perf_cpu_map__get(matched_cpus); - if (unmatched_cpus->nr > 0) { + if (perf_cpu_map__nr(unmatched_cpus) > 0) { cpu_map__snprint(matched_cpus, buf1, sizeof(buf1)); pr_warning("WARNING: use %s in '%s' for '%s', skip other cpus in list.\n", buf1, pmu->name, evsel->name); } } - if (matched_cpus->nr == 0) { + if (perf_cpu_map__nr(matched_cpus) == 0) { evlist__remove(evlist, evsel); evsel__delete(evsel); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2f6b18af49e5..fb0a2debf015 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1782,7 +1782,7 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, nthreads = threads->nr; if (evsel->core.fd == NULL && - perf_evsel__alloc_fd(&evsel->core, cpus->nr, nthreads) < 0) + perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0) return -ENOMEM; evsel->open_flags = PERF_FLAG_FD_CLOEXEC; @@ -2020,9 +2020,10 @@ retry_open: test_attr__ready(); pr_debug2_peo("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx", - pid, cpus->map[idx].cpu, group_fd, evsel->open_flags); + pid, perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags); - fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx].cpu, + fd = sys_perf_event_open(&evsel->core.attr, pid, + perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags); FD(evsel, idx, thread) = fd; @@ -2038,7 +2039,8 @@ retry_open: bpf_counter__install_pe(evsel, idx, fd); if (unlikely(test_attr__enabled)) { - test_attr__open(&evsel->core.attr, pid, cpus->map[idx], + test_attr__open(&evsel->core.attr, pid, + perf_cpu_map__cpu(cpus, idx), fd, group_fd, evsel->open_flags); } @@ -2079,7 +2081,8 @@ try_fallback: if (evsel__precise_ip_fallback(evsel)) goto retry_open; - if (evsel__ignore_missing_thread(evsel, cpus->nr, idx, threads, thread, err)) { + if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus), + idx, threads, thread, err)) { /* We just removed 1 thread, so lower the upper nthreads limit. */ nthreads--; @@ -2119,7 +2122,7 @@ out_close: int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, struct perf_thread_map *threads) { - return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1); + return evsel__open_cpu(evsel, cpus, threads, 0, perf_cpu_map__nr(cpus)); } void evsel__close(struct evsel *evsel) @@ -2131,8 +2134,7 @@ void evsel__close(struct evsel *evsel) int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu_map_idx) { if (cpu_map_idx == -1) - return evsel__open_cpu(evsel, cpus, NULL, 0, - cpus ? cpus->nr : 1); + return evsel__open_cpu(evsel, cpus, NULL, 0, perf_cpu_map__nr(cpus)); return evsel__open_cpu(evsel, cpus, NULL, cpu_map_idx, cpu_map_idx + 1); } @@ -2982,7 +2984,7 @@ int evsel__store_ids(struct evsel *evsel, struct evlist *evlist) struct perf_cpu_map *cpus = evsel->core.cpus; struct perf_thread_map *threads = evsel->core.threads; - if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr)) + if (perf_evsel__alloc_id(&evsel->core, perf_cpu_map__nr(cpus), threads->nr)) return -ENOMEM; return store_evsel_ids(evsel, evlist); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 5720ceebffac..041b42d33bf5 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -11,6 +11,7 @@ #include #include "symbol_conf.h" #include +#include struct bpf_object; struct cgroup; @@ -191,7 +192,7 @@ static inline struct perf_cpu_map *evsel__cpus(struct evsel *evsel) static inline int evsel__nr_cpus(struct evsel *evsel) { - return evsel__cpus(evsel)->nr; + return perf_cpu_map__nr(evsel__cpus(evsel)); } void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 12261ed8c15b..0e8ff8d1e206 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -250,7 +250,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask) nr_cpus = perf_cpu_map__nr(cpu_map); for (idx = 0; idx < nr_cpus; idx++) { - cpu = cpu_map->map[idx]; /* map c index to online cpu index */ + cpu = perf_cpu_map__cpu(cpu_map, idx); /* map c index to online cpu index */ if (cpu__get_node(cpu) == node) set_bit(cpu.cpu, mask->bits); } diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c index 734d006d9a8c..c28dd50bd571 100644 --- a/tools/perf/util/perf_api_probe.c +++ b/tools/perf/util/perf_api_probe.c @@ -67,7 +67,7 @@ static bool perf_probe_api(setup_probe_fn_t fn) cpus = perf_cpu_map__new(NULL); if (!cpus) return false; - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); do { @@ -144,7 +144,7 @@ bool perf_can_record_cpu_wide(void) if (!cpus) return false; - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); fd = sys_perf_event_open(&attr, -1, cpu.cpu, -1, 0); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 20461f174991..007a64681416 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -106,7 +106,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call if (opts->group) evlist__set_leader(evlist); - if (evlist->core.cpus->map[0].cpu < 0) + if (perf_cpu_map__cpu(evlist->core.cpus, 0).cpu < 0) opts->no_inherit = true; use_comm_exec = perf_can_comm_exec(); @@ -248,11 +248,11 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); if (cpus) - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); } else { - cpu = evlist->core.cpus->map[0]; + cpu = perf_cpu_map__cpu(evlist->core.cpus, 0); } while (1) { diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index f5ad0e62227a..e752e1f4a5f0 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1607,8 +1607,8 @@ static void python_process_stat(struct perf_stat_config *config, } for (thread = 0; thread < threads->nr; thread++) { - for (cpu = 0; cpu < cpus->nr; cpu++) { - process_stat(counter, cpus->map[cpu], + for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { + process_stat(counter, perf_cpu_map__cpu(cpus, cpu), perf_thread_map__pid(threads, thread), tstamp, perf_counts(counter->counts, cpu, thread)); } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f19348dddd55..2c0d30f08e78 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2537,8 +2537,8 @@ int perf_session__cpu_bitmap(struct perf_session *session, return -1; } - for (i = 0; i < map->nr; i++) { - struct perf_cpu cpu = map->map[i]; + for (i = 0; i < perf_cpu_map__nr(map); i++) { + struct perf_cpu cpu = perf_cpu_map__cpu(map, i); if (cpu.cpu >= nr_cpus) { pr_err("Requested CPU %d too large. " diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 4c9f211249db..1e0c731fc539 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -734,8 +734,8 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus) if (!m) return -1; - for (i = 0; i < m->nr; i++) { - c = m->map[i]; + for (i = 0; i < perf_cpu_map__nr(m); i++) { + c = perf_cpu_map__cpu(m, i); if (c.cpu >= nr_cpus) { ret = -1; break; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index c9ba8050cc2b..70f095624a0b 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1186,12 +1186,12 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, static void synthesize_cpus(struct cpu_map_entries *cpus, struct perf_cpu_map *map) { - int i; + int i, map_nr = perf_cpu_map__nr(map); - cpus->nr = map->nr; + cpus->nr = map_nr; - for (i = 0; i < map->nr; i++) - cpus->cpu[i] = map->map[i].cpu; + for (i = 0; i < map_nr; i++) + cpus->cpu[i] = perf_cpu_map__cpu(map, i).cpu; } static void synthesize_mask(struct perf_record_record_cpu_map *mask, @@ -1202,13 +1202,13 @@ static void synthesize_mask(struct perf_record_record_cpu_map *mask, mask->nr = BITS_TO_LONGS(max); mask->long_size = sizeof(long); - for (i = 0; i < map->nr; i++) - set_bit(map->map[i].cpu, mask->mask); + for (i = 0; i < perf_cpu_map__nr(map); i++) + set_bit(perf_cpu_map__cpu(map, i).cpu, mask->mask); } static size_t cpus_size(struct perf_cpu_map *map) { - return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16); + return sizeof(struct cpu_map_entries) + perf_cpu_map__nr(map) * sizeof(u16); } static size_t mask_size(struct perf_cpu_map *map, int *max) @@ -1217,9 +1217,9 @@ static size_t mask_size(struct perf_cpu_map *map, int *max) *max = 0; - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { /* bit position of the cpu is + 1 */ - int bit = map->map[i].cpu + 1; + int bit = perf_cpu_map__cpu(map, i).cpu + 1; if (bit > *max) *max = bit; diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 27945eeb0cb5..c1ebfc5d2e0c 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -95,15 +95,15 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) if (target->cpu_list) ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", - top->evlist->core.cpus->nr > 1 ? "s" : "", + perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : "", target->cpu_list); else { if (target->tid) ret += SNPRINTF(bf + ret, size - ret, ")"); else ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", - top->evlist->core.cpus->nr, - top->evlist->core.cpus->nr > 1 ? "s" : ""); + perf_cpu_map__nr(top->evlist->core.cpus), + perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : ""); } perf_top__reset_sample_counters(top); From 24ead7c254b42c4ea252e57bf9928154dc7744e0 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Mon, 17 Jan 2022 08:37:30 +0000 Subject: [PATCH 0412/1295] perf cpumap: Remove duplicate include in cpumap.h Remove all but the first include of stdbool.h from cpumap.h. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117083730.863200-1-lv.ruyi@zte.com.cn Signed-off-by: CGEL ZTE Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 5c85fbd709b4..703ae6d3386e 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -4,7 +4,6 @@ #include #include -#include #include #include From 3606c0e1a1050d397ad759a62607e419fd8b0ccb Mon Sep 17 00:00:00 2001 From: German Gomez Date: Tue, 18 Jan 2022 14:40:54 +0000 Subject: [PATCH 0413/1295] perf evsel: Override attr->sample_period for non-libpfm4 events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous patch preventing "attr->sample_period" values from being overridden in pfm events changed a related behaviour in arm-spe. Before said patch: perf record -c 10000 -e arm_spe_0// -- sleep 1 Would yield an SPE event with period=10000. After the patch, the period in "-c 10000" was being ignored because the arm-spe code initializes sample_period to a non-zero value. This patch restores the previous behaviour for non-libpfm4 events. Fixes: ae5dcc8abe31 (“perf record: Prevent override of attr->sample_period for libpfm4 events”) Reported-by: Chase Conklin Signed-off-by: German Gomez Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Song Liu Cc: Stephane Eranian Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20220118144054.2541-1-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index fb0a2debf015..22d3267ce294 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1064,6 +1064,17 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un { } +static void evsel__set_default_freq_period(struct record_opts *opts, + struct perf_event_attr *attr) +{ + if (opts->freq) { + attr->freq = 1; + attr->sample_freq = opts->freq; + } else { + attr->sample_period = opts->default_interval; + } +} + /* * The enable_on_exec/disabled value strategy: * @@ -1130,14 +1141,12 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, * We default some events to have a default interval. But keep * it a weak assumption overridable by the user. */ - if (!attr->sample_period) { - if (opts->freq) { - attr->freq = 1; - attr->sample_freq = opts->freq; - } else { - attr->sample_period = opts->default_interval; - } - } + if ((evsel->is_libpfm_event && !attr->sample_period) || + (!evsel->is_libpfm_event && (!attr->sample_period || + opts->user_freq != UINT_MAX || + opts->user_interval != ULLONG_MAX))) + evsel__set_default_freq_period(opts, attr); + /* * If attr->freq was set (here or earlier), ask for period * to be sampled. From 864bc8c905261f264c3ea357027cf555fe51c5a3 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:13 +0800 Subject: [PATCH 0414/1295] perf parse-events: Support event alias in form foo-bar-baz Event aliasing for events whose name in the form foo-bar-baz is not supported, while foo-bar, foo_bar_baz, and other combinations are, i.e. two hyphens are not supported. The HiSilicon D06 platform has events in such form: $ ./perf list sdir-home-migrate List of pre-defined events (to be used in -e): uncore hha: sdir-home-migrate [Unit: hisi_sccl,hha] $ sudo ./perf stat -e sdir-home-migrate event syntax error: 'sdir-home-migrate' \___ parser error Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events To support, add an extra PMU event symbol type for "baz", and add a new rule in the bison file. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-2-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 25 +++++++++++++++++++++++-- tools/perf/util/parse-events.h | 1 + tools/perf/util/parse-events.l | 2 ++ tools/perf/util/parse-events.y | 17 +++++++++++++++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index acf20ce98ce9..879f606e07e6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2098,8 +2098,17 @@ static void perf_pmu__parse_init(void) pmu = NULL; while ((pmu = perf_pmu__scan(pmu)) != NULL) { list_for_each_entry(alias, &pmu->aliases, list) { - if (strchr(alias->name, '-')) + char *tmp = strchr(alias->name, '-'); + + if (tmp) { + char *tmp2 = NULL; + + tmp2 = strchr(tmp + 1, '-'); len++; + if (tmp2) + len++; + } + len++; } } @@ -2119,8 +2128,20 @@ static void perf_pmu__parse_init(void) list_for_each_entry(alias, &pmu->aliases, list) { struct perf_pmu_event_symbol *p = perf_pmu_events_list + len; char *tmp = strchr(alias->name, '-'); + char *tmp2 = NULL; - if (tmp != NULL) { + if (tmp) + tmp2 = strchr(tmp + 1, '-'); + if (tmp2) { + SET_SYMBOL(strndup(alias->name, tmp - alias->name), + PMU_EVENT_SYMBOL_PREFIX); + p++; + tmp++; + SET_SYMBOL(strndup(tmp, tmp2 - tmp), PMU_EVENT_SYMBOL_SUFFIX); + p++; + SET_SYMBOL(strdup(++tmp2), PMU_EVENT_SYMBOL_SUFFIX2); + len += 3; + } else if (tmp) { SET_SYMBOL(strndup(alias->name, tmp - alias->name), PMU_EVENT_SYMBOL_PREFIX); p++; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index c7fc93f54577..a38b8b160e80 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -53,6 +53,7 @@ enum perf_pmu_event_symbol_type { PMU_EVENT_SYMBOL, /* normal style PMU event */ PMU_EVENT_SYMBOL_PREFIX, /* prefix of pre-suf style event */ PMU_EVENT_SYMBOL_SUFFIX, /* suffix of pre-suf style event */ + PMU_EVENT_SYMBOL_SUFFIX2, /* suffix of pre-suf2 style event */ }; struct perf_pmu_event_symbol { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 4efe9872c667..5b6e4b5249cf 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -149,6 +149,8 @@ static int pmu_str_check(yyscan_t scanner, struct parse_events_state *parse_stat return PE_PMU_EVENT_PRE; case PMU_EVENT_SYMBOL_SUFFIX: return PE_PMU_EVENT_SUF; + case PMU_EVENT_SYMBOL_SUFFIX2: + return PE_PMU_EVENT_SUF2; case PMU_EVENT_SYMBOL: return parse_state->fake_pmu ? PE_PMU_EVENT_FAKE : PE_KERNEL_PMU_EVENT; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 174158982fae..be8c51770051 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -69,7 +69,7 @@ static void inc_group_count(struct list_head *list, %token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP %token PE_ERROR -%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %token PE_ARRAY_ALL PE_ARRAY_RANGE %token PE_DRV_CFG_TERM %type PE_VALUE @@ -87,7 +87,7 @@ static void inc_group_count(struct list_head *list, %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP %type PE_EVENT_NAME -%type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM %type event_pmu_name %destructor { free ($$); } @@ -372,6 +372,19 @@ PE_KERNEL_PMU_EVENT opt_pmu_config $$ = list; } | +PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF '-' PE_PMU_EVENT_SUF2 sep_dc +{ + struct list_head *list; + char pmu_name[128]; + snprintf(pmu_name, sizeof(pmu_name), "%s-%s-%s", $1, $3, $5); + free($1); + free($3); + free($5); + if (parse_events_multi_pmu_add(_parse_state, pmu_name, NULL, &list) < 0) + YYABORT; + $$ = list; +} +| PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc { struct list_head *list; From 34fa67e72085201ea94b5332eae316951331958f Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:14 +0800 Subject: [PATCH 0415/1295] perf test: Add pmu-events test for aliases with hyphens Add a test for aliases with hyphens in the name to ensure that the pmu-events tables are as expects. There should be no reason why these sort of aliases would be treated differently, but no harm in checking. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-3-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/test/test_soc/cpu/uncore.json | 16 ++++++++++ tools/perf/tests/pmu-events.c | 32 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json index 73089c682f80..41bac1c6a008 100644 --- a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json +++ b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json @@ -18,6 +18,22 @@ "Invert": "0", "EdgeDetect": "0" }, + { + "Unit": "CBO", + "EventCode": "0xE0", + "UMask": "0x00", + "EventName": "event-hyphen", + "BriefDescription": "UNC_CBO_HYPHEN", + "PublicDescription": "UNC_CBO_HYPHEN" + }, + { + "Unit": "CBO", + "EventCode": "0xC0", + "UMask": "0x00", + "EventName": "event-two-hyph", + "BriefDescription": "UNC_CBO_TWO_HYPH", + "PublicDescription": "UNC_CBO_TWO_HYPH" + }, { "EventCode": "0x7", "EventName": "uncore_hisi_l3c.rd_hit_cpipe", diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index df1c9a3cc05b..1c695fb5a79c 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -143,6 +143,34 @@ static const struct perf_pmu_test_event unc_cbo_xsnp_response_miss_eviction = { .matching_pmu = "uncore_cbox_0", }; +static const struct perf_pmu_test_event uncore_hyphen = { + .event = { + .name = "event-hyphen", + .event = "umask=0x00,event=0xe0", + .desc = "Unit: uncore_cbox UNC_CBO_HYPHEN", + .topic = "uncore", + .long_desc = "UNC_CBO_HYPHEN", + .pmu = "uncore_cbox", + }, + .alias_str = "umask=0,event=0xe0", + .alias_long_desc = "UNC_CBO_HYPHEN", + .matching_pmu = "uncore_cbox_0", +}; + +static const struct perf_pmu_test_event uncore_two_hyph = { + .event = { + .name = "event-two-hyph", + .event = "umask=0x00,event=0xc0", + .desc = "Unit: uncore_cbox UNC_CBO_TWO_HYPH", + .topic = "uncore", + .long_desc = "UNC_CBO_TWO_HYPH", + .pmu = "uncore_cbox", + }, + .alias_str = "umask=0,event=0xc0", + .alias_long_desc = "UNC_CBO_TWO_HYPH", + .matching_pmu = "uncore_cbox_0", +}; + static const struct perf_pmu_test_event uncore_hisi_l3c_rd_hit_cpipe = { .event = { .name = "uncore_hisi_l3c.rd_hit_cpipe", @@ -188,6 +216,8 @@ static const struct perf_pmu_test_event uncore_imc_cache_hits = { static const struct perf_pmu_test_event *uncore_events[] = { &uncore_hisi_ddrc_flux_wcmd, &unc_cbo_xsnp_response_miss_eviction, + &uncore_hyphen, + &uncore_two_hyph, &uncore_hisi_l3c_rd_hit_cpipe, &uncore_imc_free_running_cache_miss, &uncore_imc_cache_hits, @@ -654,6 +684,8 @@ static struct perf_pmu_test_pmu test_pmus[] = { }, .aliases = { &unc_cbo_xsnp_response_miss_eviction, + &uncore_hyphen, + &uncore_two_hyph, }, }, { From b4a7276c5e9a79c238a2fad4fb9498dd3558ad2e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:15 +0800 Subject: [PATCH 0416/1295] perf test: Add parse-events test for aliases with hyphens Add a test which allows us to test parsing an event alias with hyphens. Since these events typically do not exist on most host systems, add the alias to the fake pmu. Function perf_pmu__test_parse_init() has terms added to match known test aliases. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-4-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 49 +++++++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 42 ++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index a508f1dbcb2a..e71efadb24f5 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2069,6 +2069,31 @@ static int test_event(struct evlist_test *e) return ret; } +static int test_event_fake_pmu(const char *str) +{ + struct parse_events_error err; + struct evlist *evlist; + int ret; + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + parse_events_error__init(&err); + perf_pmu__test_parse_init(); + ret = __parse_events(evlist, str, &err, &perf_pmu__fake); + if (ret) { + pr_debug("failed to parse event '%s', err %d, str '%s'\n", + str, ret, err.str); + parse_events_error__print(&err, str); + } + + parse_events_error__exit(&err); + evlist__delete(evlist); + + return ret; +} + static int test_events(struct evlist_test *events, unsigned cnt) { int ret1, ret2 = 0; @@ -2276,6 +2301,26 @@ static int test_pmu_events_alias(char *event, char *alias) return test_event(&e); } +static int test_pmu_events_alias2(void) +{ + static const char events[][30] = { + "event-hyphen", + "event-two-hyph", + }; + unsigned long i; + int ret = 0; + + for (i = 0; i < ARRAY_SIZE(events); i++) { + ret = test_event_fake_pmu(&events[i][0]); + if (ret) { + pr_err("check_parse_fake %s failed\n", &events[i][0]); + break; + } + } + + return ret; +} + static int test__parse_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { int ret1, ret2 = 0; @@ -2313,6 +2358,10 @@ do { \ return ret; } + ret1 = test_pmu_events_alias2(); + if (!ret2) + ret2 = ret1; + ret1 = test_terms(test__terms, ARRAY_SIZE(test__terms)); if (!ret2) ret2 = ret1; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 879f606e07e6..9739b05b999e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1697,6 +1697,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } } } + + if (parse_state->fake_pmu) { + if (!parse_events_add_pmu(parse_state, list, str, head, + true, true)) { + pr_debug("%s -> %s/%s/\n", str, "fake_pmu", str); + ok++; + } + } + out_err: if (ok) *listp = list; @@ -2168,23 +2177,38 @@ err: */ int perf_pmu__test_parse_init(void) { - struct perf_pmu_event_symbol *list; + struct perf_pmu_event_symbol *list, *tmp, symbols[] = { + {(char *)"read", PMU_EVENT_SYMBOL}, + {(char *)"event", PMU_EVENT_SYMBOL_PREFIX}, + {(char *)"two", PMU_EVENT_SYMBOL_SUFFIX}, + {(char *)"hyphen", PMU_EVENT_SYMBOL_SUFFIX}, + {(char *)"hyph", PMU_EVENT_SYMBOL_SUFFIX2}, + }; + unsigned long i, j; - list = malloc(sizeof(*list) * 1); + tmp = list = malloc(sizeof(*list) * ARRAY_SIZE(symbols)); if (!list) return -ENOMEM; - list->type = PMU_EVENT_SYMBOL; - list->symbol = strdup("read"); - - if (!list->symbol) { - free(list); - return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) { + tmp->type = symbols[i].type; + tmp->symbol = strdup(symbols[i].symbol); + if (!list->symbol) + goto err_free; } perf_pmu_events_list = list; - perf_pmu_events_list_num = 1; + perf_pmu_events_list_num = ARRAY_SIZE(symbols); + + qsort(perf_pmu_events_list, ARRAY_SIZE(symbols), + sizeof(struct perf_pmu_event_symbol), comp_pmu); return 0; + +err_free: + for (j = 0, tmp = list; j < i; j++, tmp++) + free(tmp->symbol); + free(list); + return -ENOMEM; } enum perf_pmu_event_symbol_type From f0ac5b85810a69104ee6bc939bcbaecfe4db9a3e Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 12 Jan 2022 08:01:09 +0000 Subject: [PATCH 0417/1295] perf tools: Remove redundant err variable Return value from perf_event__process_tracing_data() directly instead of taking this in another redundant variable. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20220112080109.666800-1-chi.minghao@zte.com.cn Signed-off-by: CGEL ZTE Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 409b721666cb..fbf43a454cba 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -535,12 +535,9 @@ static int perf_event__repipe_exit(struct perf_tool *tool, static int perf_event__repipe_tracing_data(struct perf_session *session, union perf_event *event) { - int err; - perf_event__repipe_synth(session->tool, event); - err = perf_event__process_tracing_data(session, event); - return err; + return perf_event__process_tracing_data(session, event); } static int dso__read_build_id(struct dso *dso) From 6b9b6413700e104934734b72a3be622a76923b98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 22 Jan 2022 09:17:10 -0500 Subject: [PATCH 0418/1295] ftrace: Fix assuming build time sort works for s390 To speed up the boot process, as mcount_loc needs to be sorted for ftrace to work properly, sorting it at build time is more efficient than boot up and can save milliseconds of time. Unfortunately, this change broke s390 as it will modify the mcount_loc location after the sorting takes place and will put back the unsorted locations. Since the sorting is skipped at boot up if it is believed that it was sorted at run time, ftrace can crash as its algorithms are dependent on the list being sorted. Add a new config BUILDTIME_MCOUNT_SORT that is set when BUILDTIME_TABLE_SORT but not if S390 is set. Use this config to determine if sorting should take place at boot up. Link: https://lore.kernel.org/all/yt9dee51ctfn.fsf@linux.ibm.com/ Fixes: 72b3942a173c ("scripts: ftrace - move the sort-processing in ftrace_init") Reported-by: Sven Schnelle Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 9 ++++++++- kernel/trace/ftrace.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f468767bc287..752ed89a293b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,13 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on BUILDTIME_TABLE_SORT && !S390 + help + Sort the mcount_loc section at build time. + config TRACER_MAX_TRACE bool @@ -918,7 +925,7 @@ config EVENT_TRACE_TEST_SYSCALLS config FTRACE_SORT_STARTUP_TEST bool "Verify compile time sorting of ftrace functions" depends on DYNAMIC_FTRACE - depends on BUILDTIME_TABLE_SORT + depends on BUILDTIME_MCOUNT_SORT help Sorting of the mcount_loc sections that is used to find the where the ftrace knows where to patch functions for tracing diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 403e485bf091..b01e1fa62193 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6429,10 +6429,10 @@ static int ftrace_process_locs(struct module *mod, /* * Sorting mcount in vmlinux at build time depend on - * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in * modules can not be sorted at build time. */ - if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { sort(start, count, sizeof(*start), ftrace_cmp_ips, NULL); } else { From e783362eb54cd99b2cac8b3a9aeac942e6f6ac07 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jan 2022 10:12:53 +0200 Subject: [PATCH 0419/1295] Linux 5.17-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c94559a97dca..0fb4f94a6885 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 16 +PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Gobble Gobble # *DOCUMENTATION* From 1ea1d6a847d2b1d17fefd9196664b95f052a0775 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 13 Jan 2022 11:44:19 +0100 Subject: [PATCH 0420/1295] s390/nmi: handle guarded storage validity failures for KVM guests machine check validity bits reflect the state of the machine check. If a guest does not make use of guarded storage, the validity bit might be off. We can not use the host CR bit to decide if the validity bit must be on. So ignore "invalid" guarded storage controls for KVM guests in the host and rely on the machine check being forwarded to the guest. If no other errors happen from a host perspective everything is fine and no process must be killed and the host can continue to run. Cc: stable@vger.kernel.org Fixes: c929500d7a5a ("s390/nmi: s390: New low level handling for machine check happening in guest") Reported-by: Carsten Otte Signed-off-by: Christian Borntraeger Tested-by: Carsten Otte Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/nmi.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 0c9e894913dc..147c0d5fd9b4 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -307,11 +307,21 @@ static int notrace s390_validate_registers(union mci mci, int umode) if (cr2.gse) { if (!mci.gs) { /* - * Guarded storage register can't be restored and - * the current processes uses guarded storage. - * It has to be terminated. + * 2 cases: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of + * guarded storage control can not be recreated, the + * process must be terminated. + * For SIE the guest values of guarded storage can not + * be recreated. This is either due to a bug or due to + * GS being disabled in the guest. The guest will be + * notified by KVM code and the guests machine check + * handling must take care of this. The host values + * are saved by KVM and are not affected. */ - kill_task = 1; + if (!test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; } else { load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } From f094a39c6ba168f2df1edfd1731cca377af5f442 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 17 Jan 2022 18:40:32 +0100 Subject: [PATCH 0421/1295] s390/nmi: handle vector validity failures for KVM guests The machine check validity bit tells about the context. If a KVM guest was running the bit tells about the guest validity and the host state is not affected. As a guest can disable the guest validity this might result in unwanted host errors on machine checks. Cc: stable@vger.kernel.org Fixes: c929500d7a5a ("s390/nmi: s390: New low level handling for machine check happening in guest") Signed-off-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/nmi.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 147c0d5fd9b4..651a51914e34 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -264,7 +264,14 @@ static int notrace s390_validate_registers(union mci mci, int umode) /* Validate vector registers */ union ctlreg0 cr0; - if (!mci.vr) { + /* + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. + */ + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is From 3d787b392d169d4a2e3aee6ac6dfd6ec39722cf2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 22 Jan 2022 10:24:31 +0100 Subject: [PATCH 0422/1295] s390/uaccess: fix compile error Compiling with e.g MARCH=z900 results in compile errors: arch/s390/lib/uaccess.c: In function 'copy_from_user_mvcos': >> arch/s390/lib/uaccess.c:65:15: error: variable 'spec' has initializer but incomplete type 65 | union oac spec = { Therefore make definition of union oac visible for all MARCHs. Reported-by: kernel test robot Cc: Nico Boehr Cc: Janis Schoetterl-Glausch Fixes: 012a224e1fa3 ("s390/uaccess: introduce bit field for OAC specifier") Signed-off-by: Heiko Carstens --- arch/s390/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 147cb3534ce4..d74e26b48604 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -47,8 +47,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); int __put_user_bad(void) __attribute__((noreturn)); int __get_user_bad(void) __attribute__((noreturn)); -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - union oac { unsigned int val; struct { @@ -71,6 +69,8 @@ union oac { }; }; +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + #define __put_get_user_asm(to, from, size, oac_spec) \ ({ \ int __rc; \ From e9b7c3a4263bdcfd31bc3d03d48ce0ded7a94635 Mon Sep 17 00:00:00 2001 From: Mihai Carabas Date: Wed, 19 Jan 2022 18:14:27 +0200 Subject: [PATCH 0423/1295] efi/libstub: arm64: Fix image check alignment at entry The kernel is aligned at SEGMENT_SIZE and this is the size populated in the PE headers: arch/arm64/kernel/efi-header.S: .long SEGMENT_ALIGN // SectionAlignment EFI_KIMG_ALIGN is defined as: (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) So it depends on THREAD_ALIGN. On newer builds this message started to appear even though the loader is taking into account the PE header (which is stating SEGMENT_ALIGN). Fixes: c32ac11da3f8 ("efi/libstub: arm64: Double check image alignment at entry") Signed-off-by: Mihai Carabas Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/arm64-stub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 2363fee9211c..9cc556013d08 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -119,9 +119,9 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (image->image_base != _text) efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); - if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN)) - efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n", - EFI_KIMG_ALIGN >> 10); + if (!IS_ALIGNED((u64)_text, SEGMENT_ALIGN)) + efi_err("FIRMWARE BUG: kernel image not aligned on %dk boundary\n", + SEGMENT_ALIGN >> 10); kernel_size = _edata - _text; kernel_memsize = kernel_size + (_end - _edata); From f5390cd0b43c2e54c7cf5506c7da4a37c5cef746 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 12 Jan 2022 11:14:13 +0100 Subject: [PATCH 0424/1295] efi: runtime: avoid EFIv2 runtime services on Apple x86 machines Aditya reports [0] that his recent MacbookPro crashes in the firmware when using the variable services at runtime. The culprit appears to be a call to QueryVariableInfo(), which we did not use to call on Apple x86 machines in the past as they only upgraded from EFI v1.10 to EFI v2.40 firmware fairly recently, and QueryVariableInfo() (along with UpdateCapsule() et al) was added in EFI v2.00. The only runtime service introduced in EFI v2.00 that we actually use in Linux is QueryVariableInfo(), as the capsule based ones are optional, generally not used at runtime (all the LVFS/fwupd firmware update infrastructure uses helper EFI programs that invoke capsule update at boot time, not runtime), and not implemented by Apple machines in the first place. QueryVariableInfo() is used to 'safely' set variables, i.e., only when there is enough space. This prevents machines with buggy firmwares from corrupting their NVRAMs when they run out of space. Given that Apple machines have been using EFI v1.10 services only for the longest time (the EFI v2.0 spec was released in 2006, and Linux support for the newly introduced runtime services was added in 2011, but the MacbookPro12,1 released in 2015 still claims to be EFI v1.10 only), let's avoid the EFI v2.0 ones on all Apple x86 machines. [0] https://lore.kernel.org/all/6D757C75-65B1-468B-842D-10410081A8E4@live.com/ Cc: Cc: Jeremy Kerr Cc: Matthew Garrett Reported-by: Aditya Garg Tested-by: Orlando Chamberlain Signed-off-by: Ard Biesheuvel Tested-by: Aditya Garg Link: https://bugzilla.kernel.org/show_bug.cgi?id=215277 --- drivers/firmware/efi/efi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index ae79c3300129..7de3f5b6e8d0 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -722,6 +722,13 @@ void __init efi_systab_report_header(const efi_table_hdr_t *systab_hdr, systab_hdr->revision >> 16, systab_hdr->revision & 0xffff, vendor); + + if (IS_ENABLED(CONFIG_X86_64) && + systab_hdr->revision > EFI_1_10_SYSTEM_TABLE_REVISION && + !strcmp(vendor, "Apple")) { + pr_info("Apple Mac detected, using EFI v1.10 runtime services only\n"); + efi.runtime_version = EFI_1_10_SYSTEM_TABLE_REVISION; + } } static __initdata char memory_type_name[][13] = { From 42fed57046fc74586d7058bd51a1c10ac9c690cb Mon Sep 17 00:00:00 2001 From: Al Cooper Date: Wed, 1 Dec 2021 13:06:51 -0500 Subject: [PATCH 0425/1295] phy: usb: Leave some clocks running during suspend The PHY client driver does a phy_exit() call on suspend or rmmod and the PHY driver needs to know the difference because some clocks need to be kept running for suspend but can be shutdown on unbind/rmmod (or if there are no PHY clients at all). The fix is to use a PM notifier so the driver can tell if a PHY client is calling exit() because of a system suspend or a driver unbind/rmmod. Signed-off-by: Al Cooper Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20211201180653.35097-2-alcooperx@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/broadcom/phy-brcm-usb.c | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/phy/broadcom/phy-brcm-usb.c b/drivers/phy/broadcom/phy-brcm-usb.c index 116fb23aebd9..0f1deb6e0eab 100644 --- a/drivers/phy/broadcom/phy-brcm-usb.c +++ b/drivers/phy/broadcom/phy-brcm-usb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "phy-brcm-usb-init.h" @@ -70,12 +71,35 @@ struct brcm_usb_phy_data { int init_count; int wake_irq; struct brcm_usb_phy phys[BRCM_USB_PHY_ID_MAX]; + struct notifier_block pm_notifier; + bool pm_active; }; static s8 *node_reg_names[BRCM_REGS_MAX] = { "crtl", "xhci_ec", "xhci_gbl", "usb_phy", "usb_mdio", "bdc_ec" }; +static int brcm_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, + void *unused) +{ + struct brcm_usb_phy_data *priv = + container_of(notifier, struct brcm_usb_phy_data, pm_notifier); + + switch (pm_event) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + priv->pm_active = true; + break; + case PM_POST_RESTORE: + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + priv->pm_active = false; + break; + } + return NOTIFY_DONE; +} + static irqreturn_t brcm_usb_phy_wake_isr(int irq, void *dev_id) { struct phy *gphy = dev_id; @@ -91,6 +115,9 @@ static int brcm_usb_phy_init(struct phy *gphy) struct brcm_usb_phy_data *priv = container_of(phy, struct brcm_usb_phy_data, phys[phy->id]); + if (priv->pm_active) + return 0; + /* * Use a lock to make sure a second caller waits until * the base phy is inited before using it. @@ -120,6 +147,9 @@ static int brcm_usb_phy_exit(struct phy *gphy) struct brcm_usb_phy_data *priv = container_of(phy, struct brcm_usb_phy_data, phys[phy->id]); + if (priv->pm_active) + return 0; + dev_dbg(&gphy->dev, "EXIT\n"); if (phy->id == BRCM_USB_PHY_2_0) brcm_usb_uninit_eohci(&priv->ini); @@ -488,6 +518,9 @@ static int brcm_usb_phy_probe(struct platform_device *pdev) if (err) return err; + priv->pm_notifier.notifier_call = brcm_pm_notifier; + register_pm_notifier(&priv->pm_notifier); + mutex_init(&priv->mutex); /* make sure invert settings are correct */ @@ -528,7 +561,10 @@ static int brcm_usb_phy_probe(struct platform_device *pdev) static int brcm_usb_phy_remove(struct platform_device *pdev) { + struct brcm_usb_phy_data *priv = dev_get_drvdata(&pdev->dev); + sysfs_remove_group(&pdev->dev.kobj, &brcm_usb_phy_group); + unregister_pm_notifier(&priv->pm_notifier); return 0; } @@ -539,6 +575,7 @@ static int brcm_usb_phy_suspend(struct device *dev) struct brcm_usb_phy_data *priv = dev_get_drvdata(dev); if (priv->init_count) { + dev_dbg(dev, "SUSPEND\n"); priv->ini.wake_enabled = device_may_wakeup(dev); if (priv->phys[BRCM_USB_PHY_3_0].inited) brcm_usb_uninit_xhci(&priv->ini); @@ -578,6 +615,7 @@ static int brcm_usb_phy_resume(struct device *dev) * Uninitialize anything that wasn't previously initialized. */ if (priv->init_count) { + dev_dbg(dev, "RESUME\n"); if (priv->wake_irq >= 0) disable_irq_wake(priv->wake_irq); brcm_usb_init_common(&priv->ini); From 5070ce86246a8a4ebacd0c15b121e6b6325bc167 Mon Sep 17 00:00:00 2001 From: Al Cooper Date: Wed, 1 Dec 2021 13:06:53 -0500 Subject: [PATCH 0426/1295] phy: broadcom: Kconfig: Fix PHY_BRCM_USB config option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit 4b402fa8e0b7 ("phy: phy-brcm-usb: support PHY on the BCM4908") added a second "default" line for ARCH_BCM_4908 above the original "default" line for ARCH_BRCMSTB. When two "default" lines are used, only the first is used and this change stopped the PHY_BRCM_USB option for being enabled for ARCH_BRCMSTB. The fix is to use one "default line with "||". Fixes: 4b402fa8e0b7 ("phy: phy-brcm-usb: support PHY on the BCM4908") Signed-off-by: Al Cooper Acked-by: Rafał Miłecki Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20211201180653.35097-4-alcooperx@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/broadcom/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/phy/broadcom/Kconfig b/drivers/phy/broadcom/Kconfig index f81e23742079..849c4204f550 100644 --- a/drivers/phy/broadcom/Kconfig +++ b/drivers/phy/broadcom/Kconfig @@ -97,8 +97,7 @@ config PHY_BRCM_USB depends on OF select GENERIC_PHY select SOC_BRCMSTB if ARCH_BRCMSTB - default ARCH_BCM4908 - default ARCH_BRCMSTB + default ARCH_BCM4908 || ARCH_BRCMSTB help Enable this to support the Broadcom STB USB PHY. This driver is required by the USB XHCI, EHCI and OHCI From b36a2050040b2d839bdc044007cdd57101d7f881 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 21 Jan 2022 04:38:56 -0800 Subject: [PATCH 0427/1295] io_uring: fix bug in slow unregistering of nodes In some cases io_rsrc_ref_quiesce will call io_rsrc_node_switch_start, and then immediately flush the delayed work queue &ctx->rsrc_put_work. However the percpu_ref_put does not immediately destroy the node, it will be called asynchronously via RCU. That ends up with io_rsrc_node_ref_zero only being called after rsrc_put_work has been flushed, and so the process ends up sleeping for 1 second unnecessarily. This patch executes the put code immediately if we are busy quiescing. Fixes: 4a38aed2a0a7 ("io_uring: batch reap of dead file registrations") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220121123856.3557884-1-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e54c4127422e..dd4c801c7afd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7822,10 +7822,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; bool first_add = false; + unsigned long delay = HZ; spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; + /* if we are mid-quiesce then do not delay */ + if (node->rsrc_data->quiesce) + delay = 0; + while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); @@ -7838,7 +7843,7 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) From 83114df32ae779df57e0af99a8ba6c3968b2ba3d Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 20 Jan 2022 10:10:25 +0000 Subject: [PATCH 0428/1295] block: fix memory leak in disk_register_independent_access_ranges kobject_init_and_add() takes reference even when it fails. According to the doc of kobject_init_and_add() If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Fix this issue by adding kobject_put(). Callback function blk_ia_ranges_sysfs_release() in kobject_put() can handle the pointer "iars" properly. Fixes: a2247f19ee1c ("block: Add independent access ranges support") Signed-off-by: Miaoqian Lin Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220120101025.22411-1-linmq006@gmail.com Signed-off-by: Jens Axboe --- block/blk-ia-ranges.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index b925f3db3ab7..18c68d8b9138 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -144,7 +144,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk, &q->kobj, "%s", "independent_access_ranges"); if (ret) { q->ia_ranges = NULL; - kfree(iars); + kobject_put(&iars->kobj); return ret; } From 94bfe2bdfc5059a0870447ccf2c8048f3d016898 Mon Sep 17 00:00:00 2001 From: Alim Akhtar Date: Wed, 5 Jan 2022 22:13:41 +0530 Subject: [PATCH 0429/1295] MAINTAINERS: add reviewer entry for Samsung/Exynos platform Adds myself as reviewer for Samsung/Exynos platform to help in review of current and upcoming SoCs patches. Signed-off-by: Alim Akhtar Acked-by: Sylwester Nawrocki Link: https://lore.kernel.org/r/20220105164341.27479-1-alim.akhtar@samsung.com Signed-off-by: Krzysztof Kozlowski --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..118bd4649fb2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2569,6 +2569,7 @@ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES M: Krzysztof Kozlowski +R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -15280,6 +15281,7 @@ PIN CONTROLLER - SAMSUNG M: Tomasz Figa M: Krzysztof Kozlowski M: Sylwester Nawrocki +R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -17070,6 +17072,7 @@ SAMSUNG SOC CLOCK DRIVERS M: Sylwester Nawrocki M: Tomasz Figa M: Chanwoo Choi +R: Alim Akhtar L: linux-samsung-soc@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/snawrocki/clk.git From 442b0c08db7e35980bed6af091877f4dda72ffca Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Fri, 14 Jan 2022 16:46:06 +0200 Subject: [PATCH 0430/1295] soc: samsung: Fix typo in CONFIG_EXYNOS_USI description The proper name is Exynos Auto V9, not V0. It was the typo slipped in unnoticed, fix it. Fixes: b603377e408f ("soc: samsung: Add USI driver") Signed-off-by: Sam Protsenko Reviewed-by: Chanho Park Link: https://lore.kernel.org/r/20220114144606.24358-1-semen.protsenko@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/soc/samsung/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/samsung/Kconfig b/drivers/soc/samsung/Kconfig index a9f8b224322e..02e319508cc6 100644 --- a/drivers/soc/samsung/Kconfig +++ b/drivers/soc/samsung/Kconfig @@ -31,7 +31,7 @@ config EXYNOS_USI help Enable support for USI block. USI (Universal Serial Interface) is an IP-core found in modern Samsung Exynos SoCs, like Exynos850 and - ExynosAutoV0. USI block can be configured to provide one of the + ExynosAutoV9. USI block can be configured to provide one of the following serial protocols: UART, SPI or High Speed I2C. This driver allows one to configure USI for desired protocol, which From 1f52b0aba6fd37653416375cb8a1ca673acf8d5f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 17 Jan 2022 16:13:28 +0000 Subject: [PATCH 0431/1295] x86/MCE/AMD: Allow thresholding interface updates after init Changes to the AMD Thresholding sysfs code prevents sysfs writes from updating the underlying registers once CPU init is completed, i.e. "threshold_banks" is set. Allow the registers to be updated if the thresholding interface is already initialized or if in the init path. Use the "set_lvt_off" value to indicate if running in the init path, since this value is only set during init. Fixes: a037f3ca0ea0 ("x86/mce/amd: Make threshold bank setting hotplug robust") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220117161328.19148-1-yazen.ghannam@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index a1e2f41796dc..9f4b508886dd 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -423,7 +423,7 @@ static void threshold_restart_bank(void *_tr) u32 hi, lo; /* sysfs write might race against an offline operation */ - if (this_cpu_read(threshold_banks)) + if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off) return; rdmsr(tr->b->address, lo, hi); From 96d9d1fa5cd505078534113308ced0aa56d8da58 Mon Sep 17 00:00:00 2001 From: Yanming Liu Date: Thu, 20 Jan 2022 04:20:52 +0800 Subject: [PATCH 0432/1295] Drivers: hv: balloon: account for vmbus packet header in max_pkt_size Commit adae1e931acd ("Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer") introduced a notion of maximum packet size in vmbus channel and used that size to initialize a buffer holding all incoming packet along with their vmbus packet header. hv_balloon uses the default maximum packet size VMBUS_DEFAULT_MAX_PKT_SIZE which matches its maximum message size, however vmbus_open expects this size to also include vmbus packet header. This leads to 4096 bytes dm_unballoon_request messages being truncated to 4080 bytes. When the driver tries to read next packet it starts from a wrong read_index, receives garbage and prints a lot of "Unhandled message: type: " in dmesg. Allocate the buffer with HV_HYP_PAGE_SIZE more bytes to make room for the header. Fixes: adae1e931acd ("Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer") Suggested-by: Michael Kelley (LINUX) Suggested-by: Andrea Parri (Microsoft) Signed-off-by: Yanming Liu Reviewed-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20220119202052.3006981-1-yanminglr@gmail.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index ca873a3b98db..f2d05bff4245 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1660,6 +1660,13 @@ static int balloon_connect_vsp(struct hv_device *dev) unsigned long t; int ret; + /* + * max_pkt_size should be large enough for one vmbus packet header plus + * our receive buffer size. Hyper-V sends messages up to + * HV_HYP_PAGE_SIZE bytes long on balloon channel. + */ + dev->channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; + ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, balloon_onchannelcallback, dev); if (ret) From 30cc53897470d45219fb0a5eafd0cc8b0032cd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 11 Jan 2022 18:29:18 +0100 Subject: [PATCH 0433/1295] pinctrl: thunderbay: comment process of building functions a bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should make code a bit easier to follow. While at it use some "for" loops to simplify array iteration loops. Ref: 5d0674999cc5 ("pinctrl: keembay: comment process of building functions a bit") Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20220111172919.6567-1-zajec5@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-thunderbay.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/pinctrl/pinctrl-thunderbay.c b/drivers/pinctrl/pinctrl-thunderbay.c index b5b47f4dd774..4756a23ca572 100644 --- a/drivers/pinctrl/pinctrl-thunderbay.c +++ b/drivers/pinctrl/pinctrl-thunderbay.c @@ -839,27 +839,30 @@ static int thunderbay_build_functions(struct thunderbay_pinctrl *tpc) void *ptr; int pin; - /* Total number of functions is unknown at this point. Allocate first. */ + /* + * Allocate maximum possible number of functions. Assume every pin + * being part of 8 (hw maximum) globally unique muxes. + */ tpc->nfuncs = 0; thunderbay_funcs = kcalloc(tpc->soc->npins * 8, sizeof(*thunderbay_funcs), GFP_KERNEL); if (!thunderbay_funcs) return -ENOMEM; - /* Find total number of functions and each's properties */ + /* Setup 1 function for each unique mux */ for (pin = 0; pin < tpc->soc->npins; pin++) { const struct pinctrl_pin_desc *pin_info = thunderbay_pins + pin; - struct thunderbay_mux_desc *pin_mux = pin_info->drv_data; + struct thunderbay_mux_desc *pin_mux; - while (pin_mux->name) { - struct function_desc *func = thunderbay_funcs; + for (pin_mux = pin_info->drv_data; pin_mux->name; pin_mux++) { + struct function_desc *func; - while (func->name) { + /* Check if we already have function for this mux */ + for (func = thunderbay_funcs; func->name; func++) { if (!strcmp(pin_mux->name, func->name)) { func->num_group_names++; break; } - func++; } if (!func->name) { @@ -868,8 +871,6 @@ static int thunderbay_build_functions(struct thunderbay_pinctrl *tpc) func->data = (int *)&pin_mux->mode; tpc->nfuncs++; } - - pin_mux++; } } From 25d2e41cf59bd6ccd23adc2965a157053bc3ed5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 11 Jan 2022 18:29:19 +0100 Subject: [PATCH 0434/1295] pinctrl: thunderbay: rework loops looking for groups names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the outer loop iterate over functions as that's the real subject. This simplifies code (and reduces amount of lines of code) as allocating memory for names doesn't require extra checks anymore. While at it use local "group_names" variable. It fixes: drivers/pinctrl/pinctrl-thunderbay.c: In function 'thunderbay_add_functions': drivers/pinctrl/pinctrl-thunderbay.c:815:8: warning: assignment discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers] 815 | grp = func->group_names; | ^ Ref: c26c4bfc1040 ("pinctrl: keembay: rework loops looking for groups names") Reported-by: Nathan Chancellor Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20220111172919.6567-2-zajec5@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-thunderbay.c | 67 ++++++++++------------------ 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/drivers/pinctrl/pinctrl-thunderbay.c b/drivers/pinctrl/pinctrl-thunderbay.c index 4756a23ca572..79d44bca039e 100644 --- a/drivers/pinctrl/pinctrl-thunderbay.c +++ b/drivers/pinctrl/pinctrl-thunderbay.c @@ -773,63 +773,42 @@ static int thunderbay_build_groups(struct thunderbay_pinctrl *tpc) static int thunderbay_add_functions(struct thunderbay_pinctrl *tpc, struct function_desc *funcs) { - struct function_desc *function = funcs; int i; /* Assign the groups for each function */ - for (i = 0; i < tpc->soc->npins; i++) { - const struct pinctrl_pin_desc *pin_info = thunderbay_pins + i; - struct thunderbay_mux_desc *pin_mux = pin_info->drv_data; + for (i = 0; i < tpc->nfuncs; i++) { + struct function_desc *func = &funcs[i]; + const char **group_names; + unsigned int grp_idx = 0; + int j; - while (pin_mux->name) { - const char **grp; - int j, grp_num, match = 0; - size_t grp_size; - struct function_desc *func; + group_names = devm_kcalloc(tpc->dev, func->num_group_names, + sizeof(*group_names), GFP_KERNEL); + if (!group_names) + return -ENOMEM; - for (j = 0; j < tpc->nfuncs; j++) { - if (!strcmp(pin_mux->name, function[j].name)) { - match = 1; - break; - } + for (j = 0; j < tpc->soc->npins; j++) { + const struct pinctrl_pin_desc *pin_info = &thunderbay_pins[j]; + struct thunderbay_mux_desc *pin_mux; + + for (pin_mux = pin_info->drv_data; pin_mux->name; pin_mux++) { + if (!strcmp(pin_mux->name, func->name)) + group_names[grp_idx++] = pin_info->name; } - - if (!match) - return -EINVAL; - - func = function + j; - grp_num = func->num_group_names; - grp_size = sizeof(*func->group_names); - - if (!func->group_names) { - func->group_names = devm_kcalloc(tpc->dev, - grp_num, - grp_size, - GFP_KERNEL); - if (!func->group_names) { - kfree(func); - return -ENOMEM; - } - } - - grp = func->group_names; - while (*grp) - grp++; - - *grp = pin_info->name; - pin_mux++; } + + func->group_names = group_names; } /* Add all functions */ for (i = 0; i < tpc->nfuncs; i++) { pinmux_generic_add_function(tpc->pctrl, - function[i].name, - function[i].group_names, - function[i].num_group_names, - function[i].data); + funcs[i].name, + funcs[i].group_names, + funcs[i].num_group_names, + funcs[i].data); } - kfree(function); + kfree(funcs); return 0; } From aa28514592d52043f4837a6457d6310452135ae1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 4 Jan 2022 17:42:38 +0100 Subject: [PATCH 0435/1295] pinctrl: cherryview: Trigger hwirq0 for interrupt-lines without a mapping Commit bdfbef2d29dc ("pinctrl: cherryview: Don't use selection 0 to mark an interrupt line as unused") made the code properly differentiate between unset vs (hwirq) 0 entries in the GPIO-controller interrupt-line to GPIO pinnumber/hwirq mapping. This is causing some boards to not boot. This commit restores the old behavior of triggering hwirq 0 when receiving an interrupt on an interrupt-line for which there is no mapping. Fixes: bdfbef2d29dc ("pinctrl: cherryview: Don't use selection 0 to mark an interrupt line as unused") Reported-and-tested-by: Jarkko Nikula Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20220104164238.253142-1-hdegoede@redhat.com Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-cherryview.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index abffda1fd51e..1d5818269076 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -1471,8 +1471,9 @@ static void chv_gpio_irq_handler(struct irq_desc *desc) offset = cctx->intr_lines[intr_line]; if (offset == CHV_INVALID_HWIRQ) { - dev_err(dev, "interrupt on unused interrupt line %u\n", intr_line); - continue; + dev_warn_once(dev, "interrupt on unmapped interrupt line %u\n", intr_line); + /* Some boards expect hwirq 0 to trigger in this case */ + offset = 0; } generic_handle_domain_irq(gc->irq.domain, offset); From 1fd6bb5b47a65eacb063b37e6fa6df2b8fa92959 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 5 Jan 2022 17:29:52 +0000 Subject: [PATCH 0436/1295] pinctrl: sunxi: Fix H616 I2S3 pin data Two bugs have sneaked in the H616 pinctrl data: - PH9 uses the mux value of 0x3 twice (one should be 0x5 instead) - PH8 and PH9 use the "i2s3" function name twice in each pin For the double pin name we use the same trick we pulled for i2s0: append the pin function to the group name to designate the special function. Fixes: 25adc29407fb ("pinctrl: sunxi: Add support for the Allwinner H616 pin controller") Reported-by: SASANO Takayoshi Signed-off-by: Andre Przywara Reviewed-by: Jernej Skrabec Reviewed-by: Samuel Holland Link: https://lore.kernel.org/r/20220105172952.23347-1-andre.przywara@arm.com Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c b/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c index ce1917e230f4..152b71226a80 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c +++ b/drivers/pinctrl/sunxi/pinctrl-sun50i-h616.c @@ -363,16 +363,16 @@ static const struct sunxi_desc_pin h616_pins[] = { SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), SUNXI_FUNCTION(0x2, "uart2"), /* CTS */ - SUNXI_FUNCTION(0x3, "i2s3"), /* DO0 */ + SUNXI_FUNCTION(0x3, "i2s3_dout0"), /* DO0 */ SUNXI_FUNCTION(0x4, "spi1"), /* MISO */ - SUNXI_FUNCTION(0x5, "i2s3"), /* DI1 */ + SUNXI_FUNCTION(0x5, "i2s3_din1"), /* DI1 */ SUNXI_FUNCTION_IRQ_BANK(0x6, 6, 8)), /* PH_EINT8 */ SUNXI_PIN(SUNXI_PINCTRL_PIN(H, 9), SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), - SUNXI_FUNCTION(0x3, "i2s3"), /* DI0 */ + SUNXI_FUNCTION(0x3, "i2s3_din0"), /* DI0 */ SUNXI_FUNCTION(0x4, "spi1"), /* CS1 */ - SUNXI_FUNCTION(0x3, "i2s3"), /* DO1 */ + SUNXI_FUNCTION(0x5, "i2s3_dout1"), /* DO1 */ SUNXI_FUNCTION_IRQ_BANK(0x6, 6, 9)), /* PH_EINT9 */ SUNXI_PIN(SUNXI_PINCTRL_PIN(H, 10), SUNXI_FUNCTION(0x0, "gpio_in"), From 9ca8581e79e51c57e60b3b8e3b89d816448f49fe Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 22 Jan 2022 10:47:22 +0900 Subject: [PATCH 0437/1295] ksmbd: fix SMB 3.11 posix extension mount failure cifs client set 4 to DataLength of create_posix context, which mean Mode variable of create_posix context is only available. So buffer validation of ksmbd should check only the size of Mode except for the size of Reserved variable. Fixes: 8f77150c15f8 ("ksmbd: add buffer validation for SMB2_CREATE_CONTEXT") Cc: stable@vger.kernel.org # v5.15+ Reported-by: Steve French Tested-by: Steve French Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 1866c81c5c99..3926ca18dca4 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2688,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work) (struct create_posix *)context; if (le16_to_cpu(context->DataOffset) + le32_to_cpu(context->DataLength) < - sizeof(struct create_posix)) { + sizeof(struct create_posix) - 4) { rc = -EINVAL; goto err_out1; } From a66c5ed539277b9f2363bbace0dba88b85b36c26 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 6 Jan 2022 11:48:52 -0800 Subject: [PATCH 0438/1295] hwmon: (lm90) Reduce maximum conversion rate for G781 According to its datasheet, G781 supports a maximum conversion rate value of 8 (62.5 ms). However, chips labeled G781 and G780 were found to only support a maximum conversion rate value of 7 (125 ms). On the other side, chips labeled G781-1 and G784 were found to support a conversion rate value of 8. There is no known means to distinguish G780 from G781 or G784; all chips report the same manufacturer ID and chip revision. Setting the conversion rate register value to 8 on chips not supporting it causes unexpected behavior since the real conversion rate is set to 0 (16 seconds) if a value of 8 is written into the conversion rate register. Limit the conversion rate register value to 7 for all G78x chips to avoid the problem. Fixes: ae544f64cc7b ("hwmon: (lm90) Add support for GMT G781") Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index 74019dff2550..cc5e48fe304b 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -373,7 +373,7 @@ static const struct lm90_params lm90_params[] = { .flags = LM90_HAVE_OFFSET | LM90_HAVE_REM_LIMIT_EXT | LM90_HAVE_BROKEN_ALERT | LM90_HAVE_CRIT, .alert_alarms = 0x7c, - .max_convrate = 8, + .max_convrate = 7, }, [lm86] = { .flags = LM90_HAVE_OFFSET | LM90_HAVE_REM_LIMIT_EXT From bc341a1a98827925082e95db174734fc8bd68af6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 8 Jan 2022 11:37:19 -0800 Subject: [PATCH 0439/1295] hwmon: (lm90) Re-enable interrupts after alert clears If alert handling is broken, interrupts are disabled after an alert and re-enabled after the alert clears. However, if there is an interrupt handler, this does not apply if alerts were originally disabled and enabled when the driver was loaded. In that case, interrupts will stay disabled after an alert was handled though the alert handler even after the alert condition clears. Address the situation by always re-enabling interrupts after the alert condition clears if there is an interrupt handler. Fixes: 2abdc357c55d9 ("hwmon: (lm90) Unmask hardware interrupt") Cc: Dmitry Osipenko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index cc5e48fe304b..e4ecf3440d7c 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -848,7 +848,7 @@ static int lm90_update_device(struct device *dev) * Re-enable ALERT# output if it was originally enabled and * relevant alarms are all clear */ - if (!(data->config_orig & 0x80) && + if ((client->irq || !(data->config_orig & 0x80)) && !(data->alarms & data->alert_alarms)) { if (data->config & 0x80) { dev_dbg(&client->dev, "Re-enabling ALERT#\n"); From a53fff96f35763d132a36c620b183fdf11022d7a Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 7 Jan 2022 11:05:23 -0800 Subject: [PATCH 0440/1295] hwmon: (lm90) Mark alert as broken for MAX6654 Experiments with MAX6654 show that its alert function is broken, similar to other chips supported by the lm90 driver. Mark it accordingly. Fixes: 229d495d8189 ("hwmon: (lm90) Add max6654 support to lm90 driver") Cc: Josh Lehan Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index e4ecf3440d7c..280ae5f58187 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -400,6 +400,7 @@ static const struct lm90_params lm90_params[] = { .reg_local_ext = MAX6657_REG_R_LOCAL_TEMPL, }, [max6654] = { + .flags = LM90_HAVE_BROKEN_ALERT, .alert_alarms = 0x7c, .max_convrate = 7, .reg_local_ext = MAX6657_REG_R_LOCAL_TEMPL, From 94746b0ba479743355e0d3cc1cb9cfe3011fb8be Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 7 Jan 2022 11:11:00 -0800 Subject: [PATCH 0441/1295] hwmon: (lm90) Mark alert as broken for MAX6680 Experiments with MAX6680 and MAX6681 show that the alert function of those chips is broken, similar to other chips supported by the lm90 driver. Mark it accordingly. Fixes: 4667bcb8d8fc ("hwmon: (lm90) Introduce chip parameter structure") Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index 280ae5f58187..06cb971c889b 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -419,7 +419,7 @@ static const struct lm90_params lm90_params[] = { }, [max6680] = { .flags = LM90_HAVE_OFFSET | LM90_HAVE_CRIT - | LM90_HAVE_CRIT_ALRM_SWP, + | LM90_HAVE_CRIT_ALRM_SWP | LM90_HAVE_BROKEN_ALERT, .alert_alarms = 0x7c, .max_convrate = 7, }, From f614629f9c1080dcc844a8430e3fb4c37ebbf05d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 7 Jan 2022 12:36:41 -0800 Subject: [PATCH 0442/1295] hwmon: (lm90) Mark alert as broken for MAX6646/6647/6649 Experiments with MAX6646 and MAX6648 show that the alert function of those chips is broken, similar to other chips supported by the lm90 driver. Mark it accordingly. Fixes: 4667bcb8d8fc ("hwmon: (lm90) Introduce chip parameter structure") Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index 06cb971c889b..ba01127c1deb 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -394,7 +394,7 @@ static const struct lm90_params lm90_params[] = { .max_convrate = 9, }, [max6646] = { - .flags = LM90_HAVE_CRIT, + .flags = LM90_HAVE_CRIT | LM90_HAVE_BROKEN_ALERT, .alert_alarms = 0x7c, .max_convrate = 6, .reg_local_ext = MAX6657_REG_R_LOCAL_TEMPL, From d379880d9adb9f1ada3f1266aa49ea2561328e08 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 10 Jan 2022 23:23:31 -0800 Subject: [PATCH 0443/1295] hwmon: (lm90) Fix sysfs and udev notifications sysfs and udev notifications need to be sent to the _alarm attributes, not to the value attributes. Fixes: 94dbd23ed88c ("hwmon: (lm90) Use hwmon_notify_event()") Cc: Dmitry Osipenko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index ba01127c1deb..1c9493c70813 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -1808,22 +1808,22 @@ static bool lm90_is_tripped(struct i2c_client *client, u16 *status) if (st & LM90_STATUS_LLOW) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_min, 0); + hwmon_temp_min_alarm, 0); if (st & LM90_STATUS_RLOW) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_min, 1); + hwmon_temp_min_alarm, 1); if (st2 & MAX6696_STATUS2_R2LOW) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_min, 2); + hwmon_temp_min_alarm, 2); if (st & LM90_STATUS_LHIGH) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_max, 0); + hwmon_temp_max_alarm, 0); if (st & LM90_STATUS_RHIGH) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_max, 1); + hwmon_temp_max_alarm, 1); if (st2 & MAX6696_STATUS2_R2HIGH) hwmon_notify_event(data->hwmon_dev, hwmon_temp, - hwmon_temp_max, 2); + hwmon_temp_max_alarm, 2); return true; } From f1e75e0d6a1ae02fa4189d7aeebab623bade2a21 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 18 Jan 2022 07:53:19 -0800 Subject: [PATCH 0444/1295] hwmon: (pmbus/ir38064) Mark ir38064_of_match as __maybe_unused If CONFIG_PM is not enabled, the following warning is reported. drivers/hwmon/pmbus/ir38064.c:54:34: warning: unused variable 'ir38064_of_match' Mark it as __maybe_unused. Reported-by: kernel test robot Cc: Arthur Heymans Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/ir38064.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/ir38064.c b/drivers/hwmon/pmbus/ir38064.c index 0ea7e1c18bdc..09276e397194 100644 --- a/drivers/hwmon/pmbus/ir38064.c +++ b/drivers/hwmon/pmbus/ir38064.c @@ -62,7 +62,7 @@ static const struct i2c_device_id ir38064_id[] = { MODULE_DEVICE_TABLE(i2c, ir38064_id); -static const struct of_device_id ir38064_of_match[] = { +static const struct of_device_id __maybe_unused ir38064_of_match[] = { { .compatible = "infineon,ir38060" }, { .compatible = "infineon,ir38064" }, { .compatible = "infineon,ir38164" }, From c1ec0cabc36718efc7fe8b4157d41b82d08ec1d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Jan 2022 14:55:43 +0300 Subject: [PATCH 0445/1295] hwmon: (adt7470) Prevent divide by zero in adt7470_fan_write() The "val" variable is controlled by the user and comes from hwmon_attr_store(). The FAN_RPM_TO_PERIOD() macro divides by "val" so a zero will crash the system. Check for that and return -EINVAL. Negatives are also invalid so return -EINVAL for those too. Fixes: fc958a61ff6d ("hwmon: (adt7470) Convert to devm_hwmon_device_register_with_info API") Signed-off-by: Dan Carpenter Signed-off-by: Guenter Roeck --- drivers/hwmon/adt7470.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hwmon/adt7470.c b/drivers/hwmon/adt7470.c index d519aca4a9d6..fb6d14d213a1 100644 --- a/drivers/hwmon/adt7470.c +++ b/drivers/hwmon/adt7470.c @@ -662,6 +662,9 @@ static int adt7470_fan_write(struct device *dev, u32 attr, int channel, long val struct adt7470_data *data = dev_get_drvdata(dev); int err; + if (val <= 0) + return -EINVAL; + val = FAN_RPM_TO_PERIOD(val); val = clamp_val(val, 1, 65534); From 6d1e6bcb31663ee83aaea1f171f3dbfe95dd4a69 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 17 Jan 2022 16:31:08 +0530 Subject: [PATCH 0446/1295] phy: ti: Fix missing sentinel for clk_div_table _get_table_maxdiv() tries to access "clk_div_table" array out of bound defined in phy-j721e-wiz.c. Add a sentinel entry to prevent the following global-out-of-bounds error reported by enabling KASAN. [ 9.552392] BUG: KASAN: global-out-of-bounds in _get_maxdiv+0xc0/0x148 [ 9.558948] Read of size 4 at addr ffff8000095b25a4 by task kworker/u4:1/38 [ 9.565926] [ 9.567441] CPU: 1 PID: 38 Comm: kworker/u4:1 Not tainted 5.16.0-116492-gdaadb3bd0e8d-dirty #360 [ 9.576242] Hardware name: Texas Instruments J721e EVM (DT) [ 9.581832] Workqueue: events_unbound deferred_probe_work_func [ 9.587708] Call trace: [ 9.590174] dump_backtrace+0x20c/0x218 [ 9.594038] show_stack+0x18/0x68 [ 9.597375] dump_stack_lvl+0x9c/0xd8 [ 9.601062] print_address_description.constprop.0+0x78/0x334 [ 9.606830] kasan_report+0x1f0/0x260 [ 9.610517] __asan_load4+0x9c/0xd8 [ 9.614030] _get_maxdiv+0xc0/0x148 [ 9.617540] divider_determine_rate+0x88/0x488 [ 9.622005] divider_round_rate_parent+0xc8/0x124 [ 9.626729] wiz_clk_div_round_rate+0x54/0x68 [ 9.631113] clk_core_determine_round_nolock+0x124/0x158 [ 9.636448] clk_core_round_rate_nolock+0x68/0x138 [ 9.641260] clk_core_set_rate_nolock+0x268/0x3a8 [ 9.645987] clk_set_rate+0x50/0xa8 [ 9.649499] cdns_sierra_phy_init+0x88/0x248 [ 9.653794] phy_init+0x98/0x108 [ 9.657046] cdns_pcie_enable_phy+0xa0/0x170 [ 9.661340] cdns_pcie_init_phy+0x250/0x2b0 [ 9.665546] j721e_pcie_probe+0x4b8/0x798 [ 9.669579] platform_probe+0x8c/0x108 [ 9.673350] really_probe+0x114/0x630 [ 9.677037] __driver_probe_device+0x18c/0x220 [ 9.681505] driver_probe_device+0xac/0x150 [ 9.685712] __device_attach_driver+0xec/0x170 [ 9.690178] bus_for_each_drv+0xf0/0x158 [ 9.694124] __device_attach+0x184/0x210 [ 9.698070] device_initial_probe+0x14/0x20 [ 9.702277] bus_probe_device+0xec/0x100 [ 9.706223] deferred_probe_work_func+0x124/0x180 [ 9.710951] process_one_work+0x4b0/0xbc0 [ 9.714983] worker_thread+0x74/0x5d0 [ 9.718668] kthread+0x214/0x230 [ 9.721919] ret_from_fork+0x10/0x20 [ 9.725520] [ 9.727032] The buggy address belongs to the variable: [ 9.732183] clk_div_table+0x24/0x440 Fixes: 091876cc355d ("phy: ti: j721e-wiz: Add support for WIZ module present in TI J721E SoC") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20220117110108.4117-1-kishon@ti.com Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-j721e-wiz.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/ti/phy-j721e-wiz.c b/drivers/phy/ti/phy-j721e-wiz.c index b3384c31637a..da546c35d1d5 100644 --- a/drivers/phy/ti/phy-j721e-wiz.c +++ b/drivers/phy/ti/phy-j721e-wiz.c @@ -233,6 +233,7 @@ static const struct clk_div_table clk_div_table[] = { { .val = 1, .div = 2, }, { .val = 2, .div = 4, }, { .val = 3, .div = 8, }, + { /* sentinel */ }, }; static const struct wiz_clk_div_sel clk_div_sel[] = { From 29afbd769ca338fa14cbfbbc824f7dc457ed7f2e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 15 Jan 2022 14:51:46 +0300 Subject: [PATCH 0447/1295] phy: cadence: Sierra: fix error handling bugs in probe() There are two bugs in the error handling: 1: If devm_of_phy_provider_register() fails then there was no cleanup. 2: The error handling called of_node_put(child) improperly leading to a use after free. We are only holding the reference inside the loop so the last two gotos after the loop lead to a use after free bug. Fix this by cleaning up the partial allocations (or partial iterations) in the loop before doing the goto. Fixes: a43f72ae136a ("phy: cadence: Sierra: Change MAX_LANES of Sierra to 16") Fixes: 44d30d622821 ("phy: cadence: Add driver for Sierra PHY") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220115115146.GC7552@kili Signed-off-by: Vinod Koul --- drivers/phy/cadence/phy-cadence-sierra.c | 33 ++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/phy/cadence/phy-cadence-sierra.c b/drivers/phy/cadence/phy-cadence-sierra.c index da24acd26666..e265647e29a2 100644 --- a/drivers/phy/cadence/phy-cadence-sierra.c +++ b/drivers/phy/cadence/phy-cadence-sierra.c @@ -1338,7 +1338,7 @@ static int cdns_sierra_phy_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; const struct cdns_sierra_data *data; unsigned int id_value; - int i, ret, node = 0; + int ret, node = 0; void __iomem *base; struct device_node *dn = dev->of_node, *child; @@ -1416,7 +1416,8 @@ static int cdns_sierra_phy_probe(struct platform_device *pdev) dev_err(dev, "failed to get reset %s\n", child->full_name); ret = PTR_ERR(sp->phys[node].lnk_rst); - goto put_child2; + of_node_put(child); + goto put_control; } if (!sp->autoconf) { @@ -1424,7 +1425,9 @@ static int cdns_sierra_phy_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "missing property in node %s\n", child->name); - goto put_child; + of_node_put(child); + reset_control_put(sp->phys[node].lnk_rst); + goto put_control; } } @@ -1434,7 +1437,9 @@ static int cdns_sierra_phy_probe(struct platform_device *pdev) if (IS_ERR(gphy)) { ret = PTR_ERR(gphy); - goto put_child; + of_node_put(child); + reset_control_put(sp->phys[node].lnk_rst); + goto put_control; } sp->phys[node].phy = gphy; phy_set_drvdata(gphy, &sp->phys[node]); @@ -1446,26 +1451,28 @@ static int cdns_sierra_phy_probe(struct platform_device *pdev) if (sp->num_lanes > SIERRA_MAX_LANES) { ret = -EINVAL; dev_err(dev, "Invalid lane configuration\n"); - goto put_child2; + goto put_control; } /* If more than one subnode, configure the PHY as multilink */ if (!sp->autoconf && sp->nsubnodes > 1) { ret = cdns_sierra_phy_configure_multilink(sp); if (ret) - goto put_child2; + goto put_control; } pm_runtime_enable(dev); phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate); - return PTR_ERR_OR_ZERO(phy_provider); + if (IS_ERR(phy_provider)) { + ret = PTR_ERR(phy_provider); + goto put_control; + } -put_child: - node++; -put_child2: - for (i = 0; i < node; i++) - reset_control_put(sp->phys[i].lnk_rst); - of_node_put(child); + return 0; + +put_control: + while (--node >= 0) + reset_control_put(sp->phys[node].lnk_rst); clk_disable: cdns_sierra_phy_disable_clocks(sp); reset_control_assert(sp->apb_rst); From aec982603aa8cc0a21143681feb5f60ecc69d718 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 Dec 2021 11:11:51 +0000 Subject: [PATCH 0448/1295] powerpc/fixmap: Fix VM debug warning on unmap Unmapping a fixmap entry is done by calling __set_fixmap() with FIXMAP_PAGE_CLEAR as flags. Today, powerpc __set_fixmap() calls map_kernel_page(). map_kernel_page() is not happy when called a second time for the same page. WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/pgtable.c:194 set_pte_at+0xc/0x1e8 CPU: 0 PID: 1 Comm: swapper Not tainted 5.16.0-rc3-s3k-dev-01993-g350ff07feb7d-dirty #682 NIP: c0017cd4 LR: c00187f0 CTR: 00000010 REGS: e1011d50 TRAP: 0700 Not tainted (5.16.0-rc3-s3k-dev-01993-g350ff07feb7d-dirty) MSR: 00029032 CR: 42000208 XER: 00000000 GPR00: c0165fec e1011e10 c14c0000 c0ee2550 ff800000 c0f3d000 00000000 c001686c GPR08: 00001000 b00045a9 00000001 c0f58460 c0f50000 00000000 c0007e10 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 GPR24: 00000000 00000000 c0ee2550 00000000 c0f57000 00000ff8 00000000 ff800000 NIP [c0017cd4] set_pte_at+0xc/0x1e8 LR [c00187f0] map_kernel_page+0x9c/0x100 Call Trace: [e1011e10] [c0736c68] vsnprintf+0x358/0x6c8 (unreliable) [e1011e30] [c0165fec] __set_fixmap+0x30/0x44 [e1011e40] [c0c13bdc] early_iounmap+0x11c/0x170 [e1011e70] [c0c06cb0] ioremap_legacy_serial_console+0x88/0xc0 [e1011e90] [c0c03634] do_one_initcall+0x80/0x178 [e1011ef0] [c0c0385c] kernel_init_freeable+0xb4/0x250 [e1011f20] [c0007e34] kernel_init+0x24/0x140 [e1011f30] [c0016268] ret_from_kernel_thread+0x5c/0x64 Instruction dump: 7fe3fb78 48019689 80010014 7c630034 83e1000c 5463d97e 7c0803a6 38210010 4e800020 81250000 712a0001 41820008 <0fe00000> 9421ffe0 93e1001c 48000030 Implement unmap_kernel_page() which clears an existing pte. Reported-by: Maxime Bizon Signed-off-by: Christophe Leroy Tested-by: Maxime Bizon Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b0b752f6f6ecc60653e873f385c6f0dce4e9ab6a.1638789098.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 1 + arch/powerpc/include/asm/book3s/64/pgtable.h | 2 ++ arch/powerpc/include/asm/fixmap.h | 6 ++++-- arch/powerpc/include/asm/nohash/32/pgtable.h | 1 + arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/powerpc/mm/pgtable.c | 9 +++++++++ 6 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 609c80f67194..f8b94f78403f 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -178,6 +178,7 @@ static inline bool pte_user(pte_t pte) #ifndef __ASSEMBLY__ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); +void unmap_kernel_page(unsigned long va); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 33e073d6b0c4..875730d5af40 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1082,6 +1082,8 @@ static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t p return hash__map_kernel_page(ea, pa, prot); } +void unmap_kernel_page(unsigned long va); + static inline int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 947b5b9c4424..a832aeafe560 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -111,8 +111,10 @@ static inline void __set_fixmap(enum fixed_addresses idx, BUILD_BUG_ON(idx >= __end_of_fixed_addresses); else if (WARN_ON(idx >= __end_of_fixed_addresses)) return; - - map_kernel_page(__fix_to_virt(idx), phys, flags); + if (pgprot_val(flags)) + map_kernel_page(__fix_to_virt(idx), phys, flags); + else + unmap_kernel_page(__fix_to_virt(idx)); } #define __early_set_fixmap __set_fixmap diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b67742e2a9b2..d959c2a73fbf 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -64,6 +64,7 @@ extern int icache_44x_need_flush; #ifndef __ASSEMBLY__ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); +void unmap_kernel_page(unsigned long va); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index a3313e853e5e..2816d158280a 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -308,6 +308,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __swp_entry_to_pte(x) __pte((x).val) int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); +void unmap_kernel_page(unsigned long va); extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index abb3198bd277..6ec5a7dd7913 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -206,6 +206,15 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, __set_pte_at(mm, addr, ptep, pte, 0); } +void unmap_kernel_page(unsigned long va) +{ + pmd_t *pmdp = pmd_off_k(va); + pte_t *ptep = pte_offset_kernel(pmdp, va); + + pte_clear(&init_mm, va, ptep); + flush_tlb_kernel_range(va, va + PAGE_SIZE); +} + /* * This is called when relaxing access to a PTE. It's also called in the page * fault path when we don't hit any of the major fault cases, ie, a minor From fb6433b48a178d4672cb26632454ee0b21056eaa Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Sat, 22 Jan 2022 09:04:29 +0530 Subject: [PATCH 0449/1295] powerpc/perf: Fix power_pmu_disable to call clear_pmi_irq_pending only if PMI is pending Running selftest with CONFIG_PPC_IRQ_SOFT_MASK_DEBUG enabled in kernel triggered below warning: [ 172.851380] ------------[ cut here ]------------ [ 172.851391] WARNING: CPU: 8 PID: 2901 at arch/powerpc/include/asm/hw_irq.h:246 power_pmu_disable+0x270/0x280 [ 172.851402] Modules linked in: dm_mod bonding nft_ct nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables rfkill nfnetlink sunrpc xfs libcrc32c pseries_rng xts vmx_crypto uio_pdrv_genirq uio sch_fq_codel ip_tables ext4 mbcache jbd2 sd_mod t10_pi sg ibmvscsi ibmveth scsi_transport_srp fuse [ 172.851442] CPU: 8 PID: 2901 Comm: lost_exception_ Not tainted 5.16.0-rc5-03218-g798527287598 #2 [ 172.851451] NIP: c00000000013d600 LR: c00000000013d5a4 CTR: c00000000013b180 [ 172.851458] REGS: c000000017687860 TRAP: 0700 Not tainted (5.16.0-rc5-03218-g798527287598) [ 172.851465] MSR: 8000000000029033 CR: 48004884 XER: 20040000 [ 172.851482] CFAR: c00000000013d5b4 IRQMASK: 1 [ 172.851482] GPR00: c00000000013d5a4 c000000017687b00 c000000002a10600 0000000000000004 [ 172.851482] GPR04: 0000000082004000 c0000008ba08f0a8 0000000000000000 00000008b7ed0000 [ 172.851482] GPR08: 00000000446194f6 0000000000008000 c00000000013b118 c000000000d58e68 [ 172.851482] GPR12: c00000000013d390 c00000001ec54a80 0000000000000000 0000000000000000 [ 172.851482] GPR16: 0000000000000000 0000000000000000 c000000015d5c708 c0000000025396d0 [ 172.851482] GPR20: 0000000000000000 0000000000000000 c00000000a3bbf40 0000000000000003 [ 172.851482] GPR24: 0000000000000000 c0000008ba097400 c0000000161e0d00 c00000000a3bb600 [ 172.851482] GPR28: c000000015d5c700 0000000000000001 0000000082384090 c0000008ba0020d8 [ 172.851549] NIP [c00000000013d600] power_pmu_disable+0x270/0x280 [ 172.851557] LR [c00000000013d5a4] power_pmu_disable+0x214/0x280 [ 172.851565] Call Trace: [ 172.851568] [c000000017687b00] [c00000000013d5a4] power_pmu_disable+0x214/0x280 (unreliable) [ 172.851579] [c000000017687b40] [c0000000003403ac] perf_pmu_disable+0x4c/0x60 [ 172.851588] [c000000017687b60] [c0000000003445e4] __perf_event_task_sched_out+0x1d4/0x660 [ 172.851596] [c000000017687c50] [c000000000d1175c] __schedule+0xbcc/0x12a0 [ 172.851602] [c000000017687d60] [c000000000d11ea8] schedule+0x78/0x140 [ 172.851608] [c000000017687d90] [c0000000001a8080] sys_sched_yield+0x20/0x40 [ 172.851615] [c000000017687db0] [c0000000000334dc] system_call_exception+0x18c/0x380 [ 172.851622] [c000000017687e10] [c00000000000c74c] system_call_common+0xec/0x268 The warning indicates that MSR_EE being set(interrupt enabled) when there was an overflown PMC detected. This could happen in power_pmu_disable since it runs under interrupt soft disable condition ( local_irq_save ) and not with interrupts hard disabled. commit 2c9ac51b850d ("powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC") intended to clear PMI pending bit in Paca when disabling the PMU. It could happen that PMC gets overflown while code is in power_pmu_disable callback function. Hence add a check to see if PMI pending bit is set in Paca before clearing it via clear_pmi_pending. Fixes: 2c9ac51b850d ("powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC") Reported-by: Sachin Sant Signed-off-by: Athira Rajeev Tested-by: Sachin Sant Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220122033429.25395-1-atrajeev@linux.vnet.ibm.com --- arch/powerpc/perf/core-book3s.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 32b98b7a1f86..b5b42cf0a703 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1355,9 +1355,20 @@ static void power_pmu_disable(struct pmu *pmu) * Otherwise provide a warning if there is PMI pending, but * no counter is found overflown. */ - if (any_pmc_overflown(cpuhw)) - clear_pmi_irq_pending(); - else + if (any_pmc_overflown(cpuhw)) { + /* + * Since power_pmu_disable runs under local_irq_save, it + * could happen that code hits a PMC overflow without PMI + * pending in paca. Hence only clear PMI pending if it was + * set. + * + * If a PMI is pending, then MSR[EE] must be disabled (because + * the masked PMI handler disabling EE). So it is safe to + * call clear_pmi_irq_pending(). + */ + if (pmi_irq_pending()) + clear_pmi_irq_pending(); + } else WARN_ON(pmi_irq_pending()); val = mmcra = cpuhw->mmcr.mmcra; From f3b7e73b2c6619884351a3a0a7468642f852b8a2 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 19 Jan 2022 19:26:37 +0100 Subject: [PATCH 0450/1295] s390/module: fix loading modules with a lot of relocations If the size of the PLT entries generated by apply_rela() exceeds 64KiB, the first ones can no longer reach __jump_r1 with brc. Fix by using brcl. An alternative solution is to add a __jump_r1 copy after every 64KiB, however, the space savings are quite small and do not justify the additional complexity. Fixes: f19fbd5ed642 ("s390: introduce execute-trampolines for branches") Cc: stable@vger.kernel.org Reported-by: Andrea Righi Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/kernel/module.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index d52d85367bf7..b032e556eeb7 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -33,7 +33,7 @@ #define DEBUGP(fmt , ...) #endif -#define PLT_ENTRY_SIZE 20 +#define PLT_ENTRY_SIZE 22 void *module_alloc(unsigned long size) { @@ -341,27 +341,26 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ if (info->plt_initialized == 0) { - unsigned int insn[5]; - unsigned int *ip = me->core_layout.base + - me->arch.plt_offset + - info->plt_offset; + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; - insn[0] = 0x0d10e310; /* basr 1,0 */ - insn[1] = 0x100a0004; /* lg 1,10(1) */ + plt_base = me->core_layout.base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { - unsigned int *ij; - ij = me->core_layout.base + - me->arch.plt_offset + - me->arch.plt_size - PLT_ENTRY_SIZE; - insn[2] = 0xa7f40000 + /* j __jump_r1 */ - (unsigned int)(u16) - (((unsigned long) ij - 8 - - (unsigned long) ip) / 2); + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; } else { - insn[2] = 0x07f10000; /* br %r1 */ + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ } - insn[3] = (unsigned int) (val >> 32); - insn[4] = (unsigned int) val; + *(long *)&insn[14] = val; write(ip, insn, sizeof(insn)); info->plt_initialized = 1; From 90c5318795eefa09a9f9aef8d18a904e24962b5c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 19 Jan 2022 19:26:38 +0100 Subject: [PATCH 0451/1295] s390/module: test loading modules with a lot of relocations Add a test in order to prevent regressions. Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 15 +++++++++ arch/s390/lib/Makefile | 3 ++ arch/s390/lib/test_modules.c | 35 +++++++++++++++++++ arch/s390/lib/test_modules.h | 50 ++++++++++++++++++++++++++++ arch/s390/lib/test_modules_helpers.c | 13 ++++++++ 5 files changed, 116 insertions(+) create mode 100644 arch/s390/lib/test_modules.c create mode 100644 arch/s390/lib/test_modules.h create mode 100644 arch/s390/lib/test_modules_helpers.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9750f92380f5..be9f39fd06df 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -945,6 +945,9 @@ config S390_GUEST endmenu +config S390_MODULES_SANITY_TEST_HELPERS + def_bool n + menu "Selftests" config S390_UNWIND_SELFTEST @@ -971,4 +974,16 @@ config S390_KPROBES_SANITY_TEST Say N if you are unsure. +config S390_MODULES_SANITY_TEST + def_tristate n + depends on KUNIT + default KUNIT_ALL_TESTS + prompt "Enable s390 specific modules tests" + select S390_MODULES_SANITY_TEST_HELPERS + help + This option enables an s390 specific modules test. This option is + not useful for distributions or general kernels, but only for + kernel developers working on architecture code. + + Say N if you are unsure. endmenu diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 707cd4622c13..69feb8ed3312 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -17,4 +17,7 @@ KASAN_SANITIZE_uaccess.o := n obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o CFLAGS_test_unwind.o += -fno-optimize-sibling-calls +obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o +obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o + lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c new file mode 100644 index 000000000000..d056baa8fbb0 --- /dev/null +++ b/arch/s390/lib/test_modules.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include + +#include "test_modules.h" + +#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) +REPEAT_10000(DECLARE_RETURN); + +/* + * Test that modules with many relocations are loaded properly. + */ +static void test_modules_many_vmlinux_relocs(struct kunit *test) +{ + int result = 0; + +#define CALL_RETURN(i) result += test_modules_return_ ## i() + REPEAT_10000(CALL_RETURN); + KUNIT_ASSERT_EQ(test, result, 49995000); +} + +static struct kunit_case modules_testcases[] = { + KUNIT_CASE(test_modules_many_vmlinux_relocs), + {} +}; + +static struct kunit_suite modules_test_suite = { + .name = "modules_test_s390", + .test_cases = modules_testcases, +}; + +kunit_test_suites(&modules_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h new file mode 100644 index 000000000000..43b5e4b4af3e --- /dev/null +++ b/arch/s390/lib/test_modules.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_MODULES_H +#define TEST_MODULES_H + +#define __REPEAT_10000_3(f, x) \ + f(x ## 0); \ + f(x ## 1); \ + f(x ## 2); \ + f(x ## 3); \ + f(x ## 4); \ + f(x ## 5); \ + f(x ## 6); \ + f(x ## 7); \ + f(x ## 8); \ + f(x ## 9) +#define __REPEAT_10000_2(f, x) \ + __REPEAT_10000_3(f, x ## 0); \ + __REPEAT_10000_3(f, x ## 1); \ + __REPEAT_10000_3(f, x ## 2); \ + __REPEAT_10000_3(f, x ## 3); \ + __REPEAT_10000_3(f, x ## 4); \ + __REPEAT_10000_3(f, x ## 5); \ + __REPEAT_10000_3(f, x ## 6); \ + __REPEAT_10000_3(f, x ## 7); \ + __REPEAT_10000_3(f, x ## 8); \ + __REPEAT_10000_3(f, x ## 9) +#define __REPEAT_10000_1(f, x) \ + __REPEAT_10000_2(f, x ## 0); \ + __REPEAT_10000_2(f, x ## 1); \ + __REPEAT_10000_2(f, x ## 2); \ + __REPEAT_10000_2(f, x ## 3); \ + __REPEAT_10000_2(f, x ## 4); \ + __REPEAT_10000_2(f, x ## 5); \ + __REPEAT_10000_2(f, x ## 6); \ + __REPEAT_10000_2(f, x ## 7); \ + __REPEAT_10000_2(f, x ## 8); \ + __REPEAT_10000_2(f, x ## 9) +#define REPEAT_10000(f) \ + __REPEAT_10000_1(f, 0); \ + __REPEAT_10000_1(f, 1); \ + __REPEAT_10000_1(f, 2); \ + __REPEAT_10000_1(f, 3); \ + __REPEAT_10000_1(f, 4); \ + __REPEAT_10000_1(f, 5); \ + __REPEAT_10000_1(f, 6); \ + __REPEAT_10000_1(f, 7); \ + __REPEAT_10000_1(f, 8); \ + __REPEAT_10000_1(f, 9) + +#endif diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c new file mode 100644 index 000000000000..1670349a03eb --- /dev/null +++ b/arch/s390/lib/test_modules_helpers.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include + +#include "test_modules.h" + +#define DEFINE_RETURN(i) \ + int test_modules_return_ ## i(void) \ + { \ + return 1 ## i - 10000; \ + } \ + EXPORT_SYMBOL_GPL(test_modules_return_ ## i) +REPEAT_10000(DEFINE_RETURN); From c9bb19368b3ab111aedf3297e65bf84c9d3aa005 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 17 Dec 2021 14:58:49 +0100 Subject: [PATCH 0452/1295] s390: update defconfigs Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 20 ++++++++++---------- arch/s390/configs/defconfig | 16 +++++++++------- arch/s390/configs/zfcpdump_defconfig | 3 +++ 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 7fe8975b49ec..498bed9b261b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -63,6 +63,7 @@ CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y @@ -96,7 +97,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y @@ -109,6 +109,7 @@ CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y CONFIG_GUP_TEST=y +CONFIG_ANON_VMA_NAME=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -116,7 +117,6 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_NET_SWITCHDEV=y CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y @@ -185,7 +185,6 @@ CONFIG_NF_CT_NETLINK_TIMEOUT=m CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m @@ -391,6 +390,7 @@ CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y @@ -400,6 +400,7 @@ CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y CONFIG_CONNECTOR=y CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m @@ -501,6 +502,7 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set # CONFIG_NET_VENDOR_HUAWEI is not set @@ -511,7 +513,6 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_ESWITCH=y # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set @@ -542,6 +543,7 @@ CONFIG_MLX5_ESWITCH=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_NET_VENDOR_XILINX is not set @@ -592,6 +594,7 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -756,9 +759,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_ZCRYPT=m CONFIG_PKEY=m CONFIG_CRYPTO_PAES_S390=m @@ -774,6 +774,8 @@ CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_CORDIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC32_SELFTEST=y CONFIG_CRC4=m CONFIG_CRC7=m @@ -807,7 +809,6 @@ CONFIG_SLUB_DEBUG_ON=y CONFIG_SLUB_STATS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y -CONFIG_DEBUG_VM_VMACACHE=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m @@ -819,12 +820,11 @@ CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y CONFIG_TEST_LOCKUP=m -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y -CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y +CONFIG_DEBUG_IRQFLAGS=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y CONFIG_BUG_ON_DATA_CORRUPTION=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 466780c465f5..61e36b999f67 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -61,6 +61,7 @@ CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y # CONFIG_GCC_PLUGINS is not set @@ -91,7 +92,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_FRONTSWAP=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y @@ -101,6 +101,7 @@ CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y +CONFIG_ANON_VMA_NAME=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -108,7 +109,6 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m -CONFIG_NET_SWITCHDEV=y CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y @@ -177,7 +177,6 @@ CONFIG_NF_CT_NETLINK_TIMEOUT=m CONFIG_NF_TABLES=m CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m @@ -382,6 +381,7 @@ CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y @@ -391,6 +391,7 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y CONFIG_CONNECTOR=y CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m @@ -492,6 +493,7 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set # CONFIG_NET_VENDOR_HUAWEI is not set @@ -502,7 +504,6 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_ESWITCH=y # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set @@ -533,6 +534,7 @@ CONFIG_MLX5_ESWITCH=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_NET_VENDOR_XILINX is not set @@ -582,6 +584,7 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -743,9 +746,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_ZCRYPT=m CONFIG_PKEY=m CONFIG_CRYPTO_PAES_S390=m @@ -762,6 +762,8 @@ CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC4=m CONFIG_CRC7=m CONFIG_CRC8=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index eed3b9acfa71..c55c668dc3c7 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,6 +1,7 @@ # CONFIG_SWAP is not set CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y # CONFIG_CPU_ISOLATION is not set # CONFIG_UTS_NS is not set # CONFIG_TIME_NS is not set @@ -34,6 +35,7 @@ CONFIG_NET=y # CONFIG_PCPU_DEV_REFCNT is not set # CONFIG_ETHTOOL_NETLINK is not set CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y CONFIG_BLK_DEV_RAM=y # CONFIG_DCSSBLK is not set # CONFIG_DASD is not set @@ -58,6 +60,7 @@ CONFIG_ZFCP=y # CONFIG_HID is not set # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set +# CONFIG_SURFACE_PLATFORMS is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set From dda8e14363f4f2bac0a1122322a35f47b5565d46 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 20 Jan 2022 20:49:48 +0100 Subject: [PATCH 0453/1295] gpio: sim: check the label length when setting up device properties If the user-space sets the chip label to an empty string - we should check the length and not override the default name or else line hogs will not be properly attached. Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 838bbfed11d3..04b137eca8da 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -816,7 +816,7 @@ gpio_sim_make_bank_swnode(struct gpio_sim_bank *bank, properties[prop_idx++] = PROPERTY_ENTRY_U32("ngpios", bank->num_lines); - if (bank->label) + if (bank->label && (strlen(bank->label) > 0)) properties[prop_idx++] = PROPERTY_ENTRY_STRING("gpio-sim,label", bank->label); From 8aa0f94b0a8d5304ea1bd63bf1ed06f9e395e328 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 22 Jan 2022 23:35:36 -0800 Subject: [PATCH 0454/1295] gpio: sim: add doc file to index file Include the gpio-sim.rst file in the GPIO index (toc/table of contents). Quietens this doc build warning: Documentation/admin-guide/gpio/gpio-sim.rst: WARNING: document isn't included in any toctree Fixes: b48f6b466e44 ("gpio: sim: new testing module") Signed-off-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Bartosz Golaszewski --- Documentation/admin-guide/gpio/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst index 7db367572f30..f6861ca16ffe 100644 --- a/Documentation/admin-guide/gpio/index.rst +++ b/Documentation/admin-guide/gpio/index.rst @@ -10,6 +10,7 @@ gpio gpio-aggregator sysfs gpio-mockup + gpio-sim .. only:: subproject and html From 278583055a237270fac70518275ba877bf9e4013 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jan 2022 18:42:07 +0000 Subject: [PATCH 0455/1295] KVM: arm64: Use shadow SPSR_EL1 when injecting exceptions on !VHE Injecting an exception into a guest with non-VHE is risky business. Instead of writing in the shadow register for the switch code to restore it, we override the CPU register instead. Which gets overriden a few instructions later by said restore code. The result is that although the guest correctly gets the exception, it will return to the original context in some random state, depending on what was there the first place... Boo. Fix the issue by writing to the shadow register. The original code is absolutely fine on VHE, as the state is already loaded, and writing to the shadow register in that case would actually be a bug. Fixes: bb666c472ca2 ("KVM: arm64: Inject AArch64 exceptions from HYP") Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20220121184207.423426-1-maz@kernel.org --- arch/arm64/kvm/hyp/exception.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 0418399e0a20..c5d009715402 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) { - write_sysreg_el1(val, SYS_SPSR); + if (has_vhe()) + write_sysreg_el1(val, SYS_SPSR); + else + __vcpu_sys_reg(vcpu, SPSR_EL1) = val; } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) From ddec7abd4d93760ad5b2c7c61bf123a7707664ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 11 Jan 2022 11:07:08 +0100 Subject: [PATCH 0456/1295] platform/x86: x86-android-tablets: Correct crystal_cove_charger module name The module was renamed to intel_crystal_cove_charger before it was merged, updated bq24190_modules to match. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220111100708.38585-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 3ba63ad91b28..8d6a3cad260b 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -187,8 +187,8 @@ static struct bq24190_platform_data bq24190_pdata = { }; static const char * const bq24190_modules[] __initconst = { - "crystal_cove_charger", /* For the bq24190 IRQ */ - "bq24190_charger", /* For the Vbus regulator for intel-int3496 */ + "intel_crystal_cove_charger", /* For the bq24190 IRQ */ + "bq24190_charger", /* For the Vbus regulator for intel-int3496 */ NULL }; From 4ce2a32d40260374dfce5344960c419fde23ce87 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:50 +0100 Subject: [PATCH 0457/1295] platform/x86: x86-android-tablets: Add support for disabling ACPI _AEI handlers Some of the broken DSDTs on these devices often also include broken / wrong _AEI (ACPI Event Interrupt) handlers, which can cause e.g. interrupt storms by listening to a floating GPIO pin. Add support for disabling these and disable them on the Asus ME176C and TF103C tablets. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 23 ++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 8d6a3cad260b..08c98e881d0e 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -26,6 +26,7 @@ #include /* For gpio_get_desc() which is EXPORT_SYMBOL_GPL() */ #include "../../gpio/gpiolib.h" +#include "../../gpio/gpiolib-acpi.h" /* * Helper code to get Linux IRQ numbers given a description of the IRQ source @@ -47,7 +48,7 @@ struct x86_acpi_irq_data { int polarity; /* ACPI_ACTIVE_HIGH / ACPI_ACTIVE_LOW / ACPI_ACTIVE_BOTH */ }; -static int x86_acpi_irq_helper_gpiochip_find(struct gpio_chip *gc, void *data) +static int gpiochip_find_match_label(struct gpio_chip *gc, void *data) { return gc->label && !strcmp(gc->label, data); } @@ -73,7 +74,7 @@ static int x86_acpi_irq_helper_get(const struct x86_acpi_irq_data *data) return irq; case X86_ACPI_IRQ_TYPE_GPIOINT: /* Like acpi_dev_gpio_irq_get(), but without parsing ACPI resources */ - chip = gpiochip_find(data->chip, x86_acpi_irq_helper_gpiochip_find); + chip = gpiochip_find(data->chip, gpiochip_find_match_label); if (!chip) { pr_err("error cannot find GPIO chip %s\n", data->chip); return -ENODEV; @@ -143,6 +144,7 @@ struct x86_serdev_info { }; struct x86_dev_info { + char *invalid_aei_gpiochip; const char * const *modules; struct gpiod_lookup_table **gpiod_lookup_tables; const struct x86_i2c_client_info *i2c_client_info; @@ -317,6 +319,7 @@ static const struct x86_dev_info asus_me176c_info __initconst = { .serdev_count = ARRAY_SIZE(asus_me176c_serdevs), .gpiod_lookup_tables = asus_me176c_gpios, .modules = bq24190_modules, + .invalid_aei_gpiochip = "INT33FC:02", }; /* Asus TF103C tablets have an Android factory img with everything hardcoded */ @@ -417,6 +420,7 @@ static const struct x86_dev_info asus_tf103c_info __initconst = { .pdev_count = ARRAY_SIZE(int3496_pdevs), .gpiod_lookup_tables = asus_tf103c_gpios, .modules = bq24190_modules, + .invalid_aei_gpiochip = "INT33FC:02", }; /* @@ -795,6 +799,7 @@ static __init int x86_android_tablet_init(void) { const struct x86_dev_info *dev_info; const struct dmi_system_id *id; + struct gpio_chip *chip; int i, ret = 0; id = dmi_first_match(x86_android_tablet_ids); @@ -803,6 +808,20 @@ static __init int x86_android_tablet_init(void) dev_info = id->driver_data; + /* + * The broken DSDTs on these devices often also include broken + * _AEI (ACPI Event Interrupt) handlers, disable these. + */ + if (dev_info->invalid_aei_gpiochip) { + chip = gpiochip_find(dev_info->invalid_aei_gpiochip, + gpiochip_find_match_label); + if (!chip) { + pr_err("error cannot find GPIO chip %s\n", dev_info->invalid_aei_gpiochip); + return -ENODEV; + } + acpi_gpiochip_free_interrupts(chip); + } + /* * Since this runs from module_init() it cannot use -EPROBE_DEFER, * instead pre-load any modules which are listed as requirements. From 84c2dcdd475f3f5d1d30c87404cafba4dd4b75ec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:51 +0100 Subject: [PATCH 0458/1295] platform/x86: x86-android-tablets: Add an init() callback to struct x86_dev_info Add an init() callback to struct x86_dev_info, board descriptions can use this to do some custom setup before registering the i2c_clients, platform- devices and servdevs. Also add an exit() callback to also allow for cleanup of the custom setup. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-2-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index 08c98e881d0e..fe4dba5997fe 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -153,6 +153,8 @@ struct x86_dev_info { int i2c_client_count; int pdev_count; int serdev_count; + int (*init)(void); + void (*exit)(void); }; /* Generic / shared bq24190 settings */ @@ -674,6 +676,7 @@ static struct i2c_client **i2c_clients; static struct platform_device **pdevs; static struct serdev_device **serdevs; static struct gpiod_lookup_table **gpiod_lookup_tables; +static void (*exit_handler)(void); static __init int x86_instantiate_i2c_client(const struct x86_dev_info *dev_info, int idx) @@ -791,6 +794,9 @@ static void x86_android_tablet_cleanup(void) kfree(i2c_clients); + if (exit_handler) + exit_handler(); + for (i = 0; gpiod_lookup_tables && gpiod_lookup_tables[i]; i++) gpiod_remove_lookup_table(gpiod_lookup_tables[i]); } @@ -833,6 +839,15 @@ static __init int x86_android_tablet_init(void) for (i = 0; gpiod_lookup_tables && gpiod_lookup_tables[i]; i++) gpiod_add_lookup_table(gpiod_lookup_tables[i]); + if (dev_info->init) { + ret = dev_info->init(); + if (ret < 0) { + x86_android_tablet_cleanup(); + return ret; + } + exit_handler = dev_info->exit; + } + i2c_clients = kcalloc(dev_info->i2c_client_count, sizeof(*i2c_clients), GFP_KERNEL); if (!i2c_clients) { x86_android_tablet_cleanup(); From 442bf564eb0c4577d98a77e87caa10f704dddcad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 10 Jan 2022 11:39:52 +0100 Subject: [PATCH 0459/1295] platform/x86: x86-android-tablets: Constify the gpiod_lookup_tables arrays The individual gpiod_lookup_table structs cannot be const because they contain a list-head which gets used when registering them. But the array of pointers to the gpiod_lookup_table-s used by a board can be const, constify these. Signed-off-by: Hans de Goede Reviewed-By: Lubomir Rintel Link: https://lore.kernel.org/r/20220110103952.48760-3-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index fe4dba5997fe..e1b22bb3dbd8 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -146,7 +146,7 @@ struct x86_serdev_info { struct x86_dev_info { char *invalid_aei_gpiochip; const char * const *modules; - struct gpiod_lookup_table **gpiod_lookup_tables; + struct gpiod_lookup_table * const *gpiod_lookup_tables; const struct x86_i2c_client_info *i2c_client_info; const struct platform_device_info *pdev_info; const struct x86_serdev_info *serdev_info; @@ -306,7 +306,7 @@ static struct gpiod_lookup_table asus_me176c_goodix_gpios = { }, }; -static struct gpiod_lookup_table *asus_me176c_gpios[] = { +static struct gpiod_lookup_table * const asus_me176c_gpios[] = { &int3496_gpo2_pin22_gpios, &asus_me176c_goodix_gpios, NULL @@ -410,7 +410,7 @@ static const struct x86_i2c_client_info asus_tf103c_i2c_clients[] __initconst = }, }; -static struct gpiod_lookup_table *asus_tf103c_gpios[] = { +static struct gpiod_lookup_table * const asus_tf103c_gpios[] = { &int3496_gpo2_pin22_gpios, NULL }; @@ -565,7 +565,7 @@ static struct gpiod_lookup_table whitelabel_tm800a550l_goodix_gpios = { }, }; -static struct gpiod_lookup_table *whitelabel_tm800a550l_gpios[] = { +static struct gpiod_lookup_table * const whitelabel_tm800a550l_gpios[] = { &whitelabel_tm800a550l_goodix_gpios, NULL }; @@ -675,7 +675,7 @@ static int serdev_count; static struct i2c_client **i2c_clients; static struct platform_device **pdevs; static struct serdev_device **serdevs; -static struct gpiod_lookup_table **gpiod_lookup_tables; +static struct gpiod_lookup_table * const *gpiod_lookup_tables; static void (*exit_handler)(void); static __init int x86_instantiate_i2c_client(const struct x86_dev_info *dev_info, From 5de2ffd5acd33368e472dd3255a51cac528c730e Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 10 Jan 2022 07:35:12 +0100 Subject: [PATCH 0460/1295] platform/x86: x86-android-tablets: Fix the buttons on CZC P10T tablet This switches the P10T tablet to "Android" mode, where the Home button sends a single sancode instead of a Windows-specific key combination and the other button doesn't disable the Wi-Fi. Signed-off-by: Lubomir Rintel Link: https://lore.kernel.org/r/20220110063512.273252-1-lkundrak@v3.sk Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/x86-android-tablets.c | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index e1b22bb3dbd8..bd1ba676bcc0 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -496,6 +496,39 @@ static const struct x86_dev_info chuwi_hi8_info __initconst = { .i2c_client_count = ARRAY_SIZE(chuwi_hi8_i2c_clients), }; +#define CZC_EC_EXTRA_PORT 0x68 +#define CZC_EC_ANDROID_KEYS 0x63 + +static int __init czc_p10t_init(void) +{ + /* + * The device boots up in "Windows 7" mode, when the home button sends a + * Windows specific key sequence (Left Meta + D) and the second button + * sends an unknown one while also toggling the Radio Kill Switch. + * This is a surprising behavior when the second button is labeled "Back". + * + * The vendor-supplied Android-x86 build switches the device to a "Android" + * mode by writing value 0x63 to the I/O port 0x68. This just seems to just + * set bit 6 on address 0x96 in the EC region; switching the bit directly + * seems to achieve the same result. It uses a "p10t_switcher" to do the + * job. It doesn't seem to be able to do anything else, and no other use + * of the port 0x68 is known. + * + * In the Android mode, the home button sends just a single scancode, + * which can be handled in Linux userspace more reasonably and the back + * button only sends a scancode without toggling the kill switch. + * The scancode can then be mapped either to Back or RF Kill functionality + * in userspace, depending on how the button is labeled on that particular + * model. + */ + outb(CZC_EC_ANDROID_KEYS, CZC_EC_EXTRA_PORT); + return 0; +} + +static const struct x86_dev_info czc_p10t __initconst = { + .init = czc_p10t_init, +}; + /* * Whitelabel (sold as various brands) TM800A550L tablets. * These tablet's DSDT contains a whole bunch of bogus ACPI I2C devices @@ -647,6 +680,24 @@ static const struct dmi_system_id x86_android_tablet_ids[] __initconst = { }, .driver_data = (void *)&chuwi_hi8_info, }, + { + /* CZC P10T */ + .ident = "CZC ODEON TPC-10 (\"P10T\")", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CZC"), + DMI_MATCH(DMI_PRODUCT_NAME, "ODEON*TPC-10"), + }, + .driver_data = (void *)&czc_p10t, + }, + { + /* A variant of CZC P10T */ + .ident = "ViewSonic ViewPad 10", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ViewSonic"), + DMI_MATCH(DMI_PRODUCT_NAME, "VPAD10"), + }, + .driver_data = (void *)&czc_p10t, + }, { /* Whitelabel (sold as various brands) TM800A550L */ .matches = { From 17f6736a020ef195f4855e807c76d2360310d143 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 10 Jan 2022 07:36:29 +0100 Subject: [PATCH 0461/1295] platform/x86: x86-android-tablets: Trivial typo fix for MODULE_AUTHOR Bring balance to the quoting of Hans' e-mail address. Signed-off-by: Lubomir Rintel Link: https://lore.kernel.org/r/20220110063629.273364-1-lkundrak@v3.sk Signed-off-by: Hans de Goede --- drivers/platform/x86/x86-android-tablets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c index bd1ba676bcc0..9360a8a92486 100644 --- a/drivers/platform/x86/x86-android-tablets.c +++ b/drivers/platform/x86/x86-android-tablets.c @@ -950,6 +950,6 @@ static __init int x86_android_tablet_init(void) module_init(x86_android_tablet_init); module_exit(x86_android_tablet_cleanup); -MODULE_AUTHOR("Hans de Goede "); MODULE_DESCRIPTION("X86 Android tablets DSDT fixups driver"); MODULE_LICENSE("GPL"); From c197e969e3082b9c19175d2f013a0dbd3ce52236 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 15 Jan 2022 15:08:49 +0100 Subject: [PATCH 0462/1295] platform/surface: Reinstate platform dependency Microsoft Surface platform-specific devices are only present on Microsoft Surface platforms, which are currently limited to arm64 and x86. Hence add a dependency on ARM64 || X86, to prevent asking the user about drivers for these devices when configuring a kernel for an architecture that does not support Microsoft Surface platforms. Fixes: 272479928172edf0 ("platform: surface: Propagate ACPI Dependency") Signed-off-by: Geert Uytterhoeven Acked-by: Maximilian Luz Link: https://lore.kernel.org/r/20220115140849.269479-1-geert@linux-m68k.org Signed-off-by: Hans de Goede --- drivers/platform/surface/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig index 5f0578e25f71..463f1ec5c14e 100644 --- a/drivers/platform/surface/Kconfig +++ b/drivers/platform/surface/Kconfig @@ -5,6 +5,7 @@ menuconfig SURFACE_PLATFORMS bool "Microsoft Surface Platform-Specific Device Drivers" + depends on ARM64 || X86 || COMPILE_TEST default y help Say Y here to get to see options for platform-specific device drivers From 512eb73cfd1208898cf10cb06094e0ee0bb53b58 Mon Sep 17 00:00:00 2001 From: Yuka Kawajiri Date: Wed, 12 Jan 2022 00:40:21 +0900 Subject: [PATCH 0463/1295] platform/x86: touchscreen_dmi: Add info for the RWC NANOTE P8 AY07J 2-in-1 Add touchscreen info for RWC NANOTE P8 (AY07J) 2-in-1. Signed-off-by: Yuka Kawajiri Link: https://lore.kernel.org/r/20220111154019.4599-1-yukx00@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/touchscreen_dmi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 494f23052678..bc97bfa8e8a6 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -770,6 +770,21 @@ static const struct ts_dmi_data predia_basic_data = { .properties = predia_basic_props, }; +static const struct property_entry rwc_nanote_p8_props[] = { + PROPERTY_ENTRY_U32("touchscreen-min-y", 46), + PROPERTY_ENTRY_U32("touchscreen-size-x", 1728), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-rwc-nanote-p8.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + { } +}; + +static const struct ts_dmi_data rwc_nanote_p8_data = { + .acpi_name = "MSSL1680:00", + .properties = rwc_nanote_p8_props, +}; + static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1715), PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), @@ -1394,6 +1409,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "0E57"), }, }, + { + /* RWC NANOTE P8 */ + .driver_data = (void *)&rwc_nanote_p8_data, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Default string"), + DMI_MATCH(DMI_PRODUCT_NAME, "AY07J"), + DMI_MATCH(DMI_PRODUCT_SKU, "0001") + }, + }, { /* Schneider SCT101CTM */ .driver_data = (void *)&schneider_sct101ctm_data, From b288420e773f5a9db77115b9cc3767a8ada16648 Mon Sep 17 00:00:00 2001 From: Alexander Kobel Date: Wed, 12 Jan 2022 12:18:27 +0100 Subject: [PATCH 0464/1295] platform/x86: thinkpad_acpi: Add quirk for ThinkPads without a fan Some ThinkPad models, like the X1 Tablet 1st and 2nd Gen, are passively cooled without any fan. Currently, an entry in /proc/acpi/ibm/fan is nevertheless created, and misleadingly shows status: enabled speed: 65535 level: auto This patch adds a TPACPI_FAN_NOFAN quirk definition and corresponding handling to not initialize a fan interface at all. For the time being, the quirk is only applied for X1 Tablet 2nd Gen (types 20JB, 20JC; EC N1O...); further models (such as Gen1, types 20GG and 20GH) can be added easily once tested. Tested on a 20JCS00C00, BIOS N1OET58W (1.43), EC N1OHT34W. Signed-off-by: Alexander Kobel Link: https://lore.kernel.org/r/12d4b825-a2b9-8cb7-6ed3-db4d66f46a60@a-kobel.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 098180fb1cfc..33f611af6e51 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -8679,9 +8679,10 @@ static const struct attribute_group fan_driver_attr_group = { .attrs = fan_driver_attributes, }; -#define TPACPI_FAN_Q1 0x0001 /* Unitialized HFSP */ -#define TPACPI_FAN_2FAN 0x0002 /* EC 0x31 bit 0 selects fan2 */ -#define TPACPI_FAN_2CTL 0x0004 /* selects fan2 control */ +#define TPACPI_FAN_Q1 0x0001 /* Uninitialized HFSP */ +#define TPACPI_FAN_2FAN 0x0002 /* EC 0x31 bit 0 selects fan2 */ +#define TPACPI_FAN_2CTL 0x0004 /* selects fan2 control */ +#define TPACPI_FAN_NOFAN 0x0008 /* no fan available */ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_QEC_IBM('1', 'Y', TPACPI_FAN_Q1), @@ -8702,6 +8703,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('N', '4', '0', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (4nd gen) */ TPACPI_Q_LNV3('N', '3', '0', TPACPI_FAN_2CTL), /* P15 (1st gen) / P15v (1st gen) */ TPACPI_Q_LNV3('N', '3', '2', TPACPI_FAN_2CTL), /* X1 Carbon (9th gen) */ + TPACPI_Q_LNV3('N', '1', 'O', TPACPI_FAN_NOFAN), /* X1 Tablet (2nd gen) */ }; static int __init fan_init(struct ibm_init_struct *iibm) @@ -8730,6 +8732,11 @@ static int __init fan_init(struct ibm_init_struct *iibm) quirks = tpacpi_check_quirks(fan_quirk_table, ARRAY_SIZE(fan_quirk_table)); + if (quirks & TPACPI_FAN_NOFAN) { + pr_info("No integrated ThinkPad fan available\n"); + return -ENODEV; + } + if (gfan_handle) { /* 570, 600e/x, 770e, 770x */ fan_status_access_mode = TPACPI_FAN_RD_ACPI_GFAN; From a29012ab23163f78087a7e77719f05d201088700 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jan 2022 00:23:09 +0100 Subject: [PATCH 0465/1295] platform/x86: intel_crystal_cove_charger: Fix IRQ masking / unmasking The driver as originally submitted accidentally relied on Android having run before and Android having unmasked the 2nd level IRQ-mask for the charger IRQ. This worked since these are PMIC registers which are only reset when the battery is fully drained or disconnected. Fix the charger IRQ no longer working after loss of battery power by properly setting the 2nd level IRQ-mask for the charger IRQ. Note this removes the need to enable/disable our parent IRQ which just sets the mask bit in the 1st level IRQ-mask register, setting one of the 2 level masks is enough to stop the IRQ from getting reported. Fixes: 761db353d9e2 ("platform/x86: Add intel_crystal_cove_charger driver") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220111232309.377642-1-hdegoede@redhat.com --- .../platform/x86/intel/crystal_cove_charger.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/platform/x86/intel/crystal_cove_charger.c b/drivers/platform/x86/intel/crystal_cove_charger.c index 0374bc742513..e4299cfa2205 100644 --- a/drivers/platform/x86/intel/crystal_cove_charger.c +++ b/drivers/platform/x86/intel/crystal_cove_charger.c @@ -17,6 +17,7 @@ #include #define CHGRIRQ_REG 0x0a +#define MCHGRIRQ_REG 0x17 struct crystal_cove_charger_data { struct mutex buslock; /* irq_bus_lock */ @@ -25,8 +26,8 @@ struct crystal_cove_charger_data { struct irq_domain *irq_domain; int irq; int charger_irq; - bool irq_enabled; - bool irq_is_enabled; + u8 mask; + u8 new_mask; }; static irqreturn_t crystal_cove_charger_irq(int irq, void *data) @@ -53,13 +54,9 @@ static void crystal_cove_charger_irq_bus_sync_unlock(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - if (charger->irq_is_enabled != charger->irq_enabled) { - if (charger->irq_enabled) - enable_irq(charger->irq); - else - disable_irq(charger->irq); - - charger->irq_is_enabled = charger->irq_enabled; + if (charger->mask != charger->new_mask) { + regmap_write(charger->regmap, MCHGRIRQ_REG, charger->new_mask); + charger->mask = charger->new_mask; } mutex_unlock(&charger->buslock); @@ -69,14 +66,14 @@ static void crystal_cove_charger_irq_unmask(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - charger->irq_enabled = true; + charger->new_mask &= ~BIT(data->hwirq); } static void crystal_cove_charger_irq_mask(struct irq_data *data) { struct crystal_cove_charger_data *charger = irq_data_get_irq_chip_data(data); - charger->irq_enabled = false; + charger->new_mask |= BIT(data->hwirq); } static void crystal_cove_charger_rm_irq_domain(void *data) @@ -130,10 +127,13 @@ static int crystal_cove_charger_probe(struct platform_device *pdev) irq_set_nested_thread(charger->charger_irq, true); irq_set_noprobe(charger->charger_irq); + /* Mask the single 2nd level IRQ before enabling the 1st level IRQ */ + charger->mask = charger->new_mask = BIT(0); + regmap_write(charger->regmap, MCHGRIRQ_REG, charger->mask); + ret = devm_request_threaded_irq(&pdev->dev, charger->irq, NULL, crystal_cove_charger_irq, - IRQF_ONESHOT | IRQF_NO_AUTOEN, - KBUILD_MODNAME, charger); + IRQF_ONESHOT, KBUILD_MODNAME, charger); if (ret) return dev_err_probe(&pdev->dev, ret, "requesting irq\n"); From 17da2d5f93692086dd096a975225ffd5622d0bf8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 11 Jan 2022 18:25:21 -0800 Subject: [PATCH 0466/1295] platform/x86: ISST: Fix possible circular locking dependency detected As reported: [ 256.104522] ====================================================== [ 256.113783] WARNING: possible circular locking dependency detected [ 256.120093] 5.16.0-rc6-yocto-standard+ #99 Not tainted [ 256.125362] ------------------------------------------------------ [ 256.131673] intel-speed-sel/844 is trying to acquire lock: [ 256.137290] ffffffffc036f0d0 (punit_misc_dev_lock){+.+.}-{3:3}, at: isst_if_open+0x18/0x90 [isst_if_common] [ 256.147171] [ 256.147171] but task is already holding lock: [ 256.153135] ffffffff8ee7cb50 (misc_mtx){+.+.}-{3:3}, at: misc_open+0x2a/0x170 [ 256.160407] [ 256.160407] which lock already depends on the new lock. [ 256.160407] [ 256.168712] [ 256.168712] the existing dependency chain (in reverse order) is: [ 256.176327] [ 256.176327] -> #1 (misc_mtx){+.+.}-{3:3}: [ 256.181946] lock_acquire+0x1e6/0x330 [ 256.186265] __mutex_lock+0x9b/0x9b0 [ 256.190497] mutex_lock_nested+0x1b/0x20 [ 256.195075] misc_register+0x32/0x1a0 [ 256.199390] isst_if_cdev_register+0x65/0x180 [isst_if_common] [ 256.205878] isst_if_probe+0x144/0x16e [isst_if_mmio] ... [ 256.241976] [ 256.241976] -> #0 (punit_misc_dev_lock){+.+.}-{3:3}: [ 256.248552] validate_chain+0xbc6/0x1750 [ 256.253131] __lock_acquire+0x88c/0xc10 [ 256.257618] lock_acquire+0x1e6/0x330 [ 256.261933] __mutex_lock+0x9b/0x9b0 [ 256.266165] mutex_lock_nested+0x1b/0x20 [ 256.270739] isst_if_open+0x18/0x90 [isst_if_common] [ 256.276356] misc_open+0x100/0x170 [ 256.280409] chrdev_open+0xa5/0x1e0 ... The call sequence suggested that misc_device /dev file can be opened before misc device is yet to be registered, which is done only once. Here punit_misc_dev_lock was used as common lock, to protect the registration by multiple ISST HW drivers, one time setup, prevent duplicate registry of misc device and prevent load/unload when device is open. We can split into locks: - One which just prevent duplicate call to misc_register() and one time setup. Also never call again if the misc_register() failed or required one time setup is failed. This lock is not shared with any misc device callbacks. - The other lock protects registry, load and unload of HW drivers. Sequence in isst_if_cdev_register() - Register callbacks under punit_misc_dev_open_lock - Call isst_misc_reg() which registers misc_device on the first registry which is under punit_misc_dev_reg_lock, which is not shared with callbacks. Sequence in isst_if_cdev_unregister Just opposite of isst_if_cdev_register Reported-and-tested-by: Liwei Song Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20220112022521.54669-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- .../intel/speed_select_if/isst_if_common.c | 97 ++++++++++++------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index c9a85eb2e860..e8424e70d81d 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -596,7 +596,10 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, return ret; } -static DEFINE_MUTEX(punit_misc_dev_lock); +/* Lock to prevent module registration when already opened by user space */ +static DEFINE_MUTEX(punit_misc_dev_open_lock); +/* Lock to allow one share misc device for all ISST interace */ +static DEFINE_MUTEX(punit_misc_dev_reg_lock); static int misc_usage_count; static int misc_device_ret; static int misc_device_open; @@ -606,7 +609,7 @@ static int isst_if_open(struct inode *inode, struct file *file) int i, ret = 0; /* Fail open, if a module is going away */ - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); for (i = 0; i < ISST_IF_DEV_MAX; ++i) { struct isst_if_cmd_cb *cb = &punit_callbacks[i]; @@ -628,7 +631,7 @@ static int isst_if_open(struct inode *inode, struct file *file) } else { misc_device_open++; } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return ret; } @@ -637,7 +640,7 @@ static int isst_if_relase(struct inode *inode, struct file *f) { int i; - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); misc_device_open--; for (i = 0; i < ISST_IF_DEV_MAX; ++i) { struct isst_if_cmd_cb *cb = &punit_callbacks[i]; @@ -645,7 +648,7 @@ static int isst_if_relase(struct inode *inode, struct file *f) if (cb->registered) module_put(cb->owner); } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return 0; } @@ -662,6 +665,43 @@ static struct miscdevice isst_if_char_driver = { .fops = &isst_if_char_driver_ops, }; +static int isst_misc_reg(void) +{ + mutex_lock(&punit_misc_dev_reg_lock); + if (misc_device_ret) + goto unlock_exit; + + if (!misc_usage_count) { + misc_device_ret = isst_if_cpu_info_init(); + if (misc_device_ret) + goto unlock_exit; + + misc_device_ret = misc_register(&isst_if_char_driver); + if (misc_device_ret) { + isst_if_cpu_info_exit(); + goto unlock_exit; + } + } + misc_usage_count++; + +unlock_exit: + mutex_unlock(&punit_misc_dev_reg_lock); + + return misc_device_ret; +} + +static void isst_misc_unreg(void) +{ + mutex_lock(&punit_misc_dev_reg_lock); + if (misc_usage_count) + misc_usage_count--; + if (!misc_usage_count && !misc_device_ret) { + misc_deregister(&isst_if_char_driver); + isst_if_cpu_info_exit(); + } + mutex_unlock(&punit_misc_dev_reg_lock); +} + /** * isst_if_cdev_register() - Register callback for IOCTL * @device_type: The device type this callback handling. @@ -679,38 +719,31 @@ static struct miscdevice isst_if_char_driver = { */ int isst_if_cdev_register(int device_type, struct isst_if_cmd_cb *cb) { - if (misc_device_ret) - return misc_device_ret; + int ret; if (device_type >= ISST_IF_DEV_MAX) return -EINVAL; - mutex_lock(&punit_misc_dev_lock); + mutex_lock(&punit_misc_dev_open_lock); + /* Device is already open, we don't want to add new callbacks */ if (misc_device_open) { - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); return -EAGAIN; } - if (!misc_usage_count) { - int ret; - - misc_device_ret = misc_register(&isst_if_char_driver); - if (misc_device_ret) - goto unlock_exit; - - ret = isst_if_cpu_info_init(); - if (ret) { - misc_deregister(&isst_if_char_driver); - misc_device_ret = ret; - goto unlock_exit; - } - } memcpy(&punit_callbacks[device_type], cb, sizeof(*cb)); punit_callbacks[device_type].registered = 1; - misc_usage_count++; -unlock_exit: - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); - return misc_device_ret; + ret = isst_misc_reg(); + if (ret) { + /* + * No need of mutex as the misc device register failed + * as no one can open device yet. Hence no contention. + */ + punit_callbacks[device_type].registered = 0; + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(isst_if_cdev_register); @@ -725,16 +758,12 @@ EXPORT_SYMBOL_GPL(isst_if_cdev_register); */ void isst_if_cdev_unregister(int device_type) { - mutex_lock(&punit_misc_dev_lock); - misc_usage_count--; + isst_misc_unreg(); + mutex_lock(&punit_misc_dev_open_lock); punit_callbacks[device_type].registered = 0; if (device_type == ISST_IF_DEV_MBOX) isst_delete_hash(); - if (!misc_usage_count && !misc_device_ret) { - misc_deregister(&isst_if_char_driver); - isst_if_cpu_info_exit(); - } - mutex_unlock(&punit_misc_dev_lock); + mutex_unlock(&punit_misc_dev_open_lock); } EXPORT_SYMBOL_GPL(isst_if_cdev_unregister); From f7086daab3b540c89951b9b4c00fc49111f7cfa6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 17 Jan 2022 12:26:43 +0100 Subject: [PATCH 0467/1295] platform/x86: amd-pmc: Make amd_pmc_stb_debugfs_fops static amd_pmc_stb_debugfs_fops is not used outside of amd-pmc.c, make it static. Cc: Sanket Goswami Reported-by: kernel test robot Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220117112644.260168-1-hdegoede@redhat.com --- drivers/platform/x86/amd-pmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index f794343d6aaa..85b680297934 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -226,7 +226,7 @@ static int amd_pmc_stb_debugfs_release(struct inode *inode, struct file *filp) return 0; } -const struct file_operations amd_pmc_stb_debugfs_fops = { +static const struct file_operations amd_pmc_stb_debugfs_fops = { .owner = THIS_MODULE, .open = amd_pmc_stb_debugfs_open, .read = amd_pmc_stb_debugfs_read, From f8c28b93d2628610cf793b3528f6f40fd1c7cd5b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 17 Jan 2022 12:26:44 +0100 Subject: [PATCH 0468/1295] platform/x86: asus-tf103c-dock: Make 2 global structs static tf103c_dock_hid_ll_driver and tf103c_dock_pm_ops are not used outside of the driver, make them both static. Reported-by: kernel test robot Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220117112644.260168-2-hdegoede@redhat.com --- drivers/platform/x86/asus-tf103c-dock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/asus-tf103c-dock.c b/drivers/platform/x86/asus-tf103c-dock.c index d4ef8f362ee6..6fd0c9fea82d 100644 --- a/drivers/platform/x86/asus-tf103c-dock.c +++ b/drivers/platform/x86/asus-tf103c-dock.c @@ -250,7 +250,7 @@ static int tf103c_dock_hid_raw_request(struct hid_device *hid, u8 reportnum, return 0; } -struct hid_ll_driver tf103c_dock_hid_ll_driver = { +static struct hid_ll_driver tf103c_dock_hid_ll_driver = { .parse = tf103c_dock_hid_parse, .start = tf103c_dock_hid_start, .stop = tf103c_dock_hid_stop, @@ -921,7 +921,7 @@ static int __maybe_unused tf103c_dock_resume(struct device *dev) return 0; } -SIMPLE_DEV_PM_OPS(tf103c_dock_pm_ops, tf103c_dock_suspend, tf103c_dock_resume); +static SIMPLE_DEV_PM_OPS(tf103c_dock_pm_ops, tf103c_dock_suspend, tf103c_dock_resume); static const struct acpi_device_id tf103c_dock_acpi_match[] = { {"NPCE69A"}, From b8fb0d9b47660ddb8a8256412784aad7cee9f21a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 20 Jan 2022 11:44:39 -0600 Subject: [PATCH 0469/1295] platform/x86: amd-pmc: Correct usage of SMU version Yellow carp has been outputting versions like `1093.24.0`, but this is supposed to be 69.24.0. That is the MSB is being interpreted incorrectly. The MSB is not part of the major version, but has generally been treated that way thus far. It's actually the program, and used to distinguish between two programs from a similar family but different codebase. Link: https://patchwork.freedesktop.org/patch/469993/ Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20220120174439.12770-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd-pmc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index 85b680297934..4c72ba68b315 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -124,9 +124,10 @@ struct amd_pmc_dev { u32 cpu_id; u32 active_ips; /* SMU version information */ - u16 major; - u16 minor; - u16 rev; + u8 smu_program; + u8 major; + u8 minor; + u8 rev; struct device *dev; struct pci_dev *rdev; struct mutex lock; /* generic mutex lock */ @@ -180,11 +181,13 @@ static int amd_pmc_get_smu_version(struct amd_pmc_dev *dev) if (rc) return rc; - dev->major = (val >> 16) & GENMASK(15, 0); + dev->smu_program = (val >> 24) & GENMASK(7, 0); + dev->major = (val >> 16) & GENMASK(7, 0); dev->minor = (val >> 8) & GENMASK(7, 0); dev->rev = (val >> 0) & GENMASK(7, 0); - dev_dbg(dev->dev, "SMU version is %u.%u.%u\n", dev->major, dev->minor, dev->rev); + dev_dbg(dev->dev, "SMU program %u version is %u.%u.%u\n", + dev->smu_program, dev->major, dev->minor, dev->rev); return 0; } From 2148927e6ed43a1667baf7c2ae3e0e05a44b51a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Wed, 19 Jan 2022 17:44:55 +0100 Subject: [PATCH 0470/1295] net: sfp: ignore disabled SFP node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages") added code which finds SFP bus DT node even if the node is disabled with status = "disabled". Because of this, when phylink is created, it ends with non-null .sfp_bus member, even though the SFP module is not probed (because the node is disabled). We need to ignore disabled SFP bus node. Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages") Signed-off-by: Marek Behún Cc: stable@vger.kernel.org # 2203cbf2c8b5 ("net: sfp: move fwnode parsing into sfp-bus layer") Signed-off-by: David S. Miller --- drivers/net/phy/sfp-bus.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 0c6c0d1843bc..c1512c9925a6 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -651,6 +651,11 @@ struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode) else if (ret < 0) return ERR_PTR(ret); + if (!fwnode_device_is_available(ref.fwnode)) { + fwnode_handle_put(ref.fwnode); + return NULL; + } + bus = sfp_bus_get(ref.fwnode); fwnode_handle_put(ref.fwnode); if (!bus) From 9decff5f403f9a48f639736ec0271e2870cadbb6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Dec 2021 15:32:45 +0100 Subject: [PATCH 0471/1295] optee: Use bitmap_free() to free bitmap kfree() and bitmap_free() are the same. But using the latter is more consistent when freeing memory allocated with bitmap_zalloc(). Signed-off-by: Christophe JAILLET Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/notif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/optee/notif.c b/drivers/tee/optee/notif.c index a28fa03dcd0e..05212842b0a5 100644 --- a/drivers/tee/optee/notif.c +++ b/drivers/tee/optee/notif.c @@ -121,5 +121,5 @@ int optee_notif_init(struct optee *optee, u_int max_key) void optee_notif_uninit(struct optee *optee) { - kfree(optee->notif.bitmap); + bitmap_free(optee->notif.bitmap); } From abc8dc34d1f6e34ed346c6e3fc554127e421b769 Mon Sep 17 00:00:00 2001 From: Jerome Forissier Date: Thu, 13 Jan 2022 16:27:13 +0100 Subject: [PATCH 0472/1295] tee: optee: do not check memref size on return from Secure World Commit c650b8dc7a79 ("tee: optee: do not check memref size on return from Secure World") was mistakenly lost in commit 4602c5842f64 ("optee: refactor driver with internal callbacks"). Remove the unwanted code again. Fixes: 4602c5842f64 ("optee: refactor driver with internal callbacks") Signed-off-by: Jerome Forissier Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/smc_abi.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index 449d6a72d289..dc40ae8b83b6 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -75,16 +75,6 @@ static int from_msg_param_tmp_mem(struct tee_param *p, u32 attr, p->u.memref.shm_offs = mp->u.tmem.buf_ptr - pa; p->u.memref.shm = shm; - /* Check that the memref is covered by the shm object */ - if (p->u.memref.size) { - size_t o = p->u.memref.shm_offs + - p->u.memref.size - 1; - - rc = tee_shm_get_pa(shm, o, NULL); - if (rc) - return rc; - } - return 0; } From aa6034678e873db8bd5c5a4b73f8b88c469374d6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 21 Jan 2022 16:25:18 +0800 Subject: [PATCH 0473/1295] bonding: use rcu_dereference_rtnl when get bonding active slave bond_option_active_slave_get_rcu() should not be used in rtnl_mutex as it use rcu_dereference(). Replace to rcu_dereference_rtnl() so we also can use this function in rtnl protected context. With this update, we can rmeove the rcu_read_lock/unlock in bonding .ndo_eth_ioctl and .get_ts_info. Reported-by: Vladimir Oltean Fixes: 94dd016ae538 ("bond: pass get_ts_info and SIOC[SG]HWTSTAMP ioctl to active device") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 ---- include/net/bonding.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ec498ce70f35..238b56d77c36 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4133,9 +4133,7 @@ static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cm fallthrough; case SIOCGHWTSTAMP: - rcu_read_lock(); real_dev = bond_option_active_slave_get_rcu(bond); - rcu_read_unlock(); if (!real_dev) return -EOPNOTSUPP; @@ -5382,9 +5380,7 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev, struct net_device *real_dev; struct phy_device *phydev; - rcu_read_lock(); real_dev = bond_option_active_slave_get_rcu(bond); - rcu_read_unlock(); if (real_dev) { ops = real_dev->ethtool_ops; phydev = real_dev->phydev; diff --git a/include/net/bonding.h b/include/net/bonding.h index f6ae3a4baea4..83cfd2d70247 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -346,7 +346,7 @@ static inline bool bond_uses_primary(struct bonding *bond) static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { - struct slave *slave = rcu_dereference(bond->curr_active_slave); + struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } From 1d10f8a1f40b965d449e8f2d5ed7b96a7c138b77 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Fri, 21 Jan 2022 17:15:31 +0800 Subject: [PATCH 0474/1295] net-procfs: show net devices bound packet types After commit:7866a621043f ("dev: add per net_device packet type chains"), we can not get packet types that are bound to a specified net device by /proc/net/ptype, this patch fix the regression. Run "tcpdump -i ens192 udp -nns0" Before and after apply this patch: Before: [root@localhost ~]# cat /proc/net/ptype Type Device Function 0800 ip_rcv 0806 arp_rcv 86dd ipv6_rcv After: [root@localhost ~]# cat /proc/net/ptype Type Device Function ALL ens192 tpacket_rcv 0800 ip_rcv 0806 arp_rcv 86dd ipv6_rcv v1 -> v2: - fix the regression rather than adding new /proc API as suggested by Stephen Hemminger. Fixes: 7866a621043f ("dev: add per net_device packet type chains") Signed-off-by: Jianguo Wu Signed-off-by: David S. Miller --- net/core/net-procfs.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 5b8016335aca..88cc0ad7d386 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -190,12 +190,23 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + list_for_each_entry_rcu(pt, &ptype_all, list) { if (i == pos) return pt; @@ -216,22 +227,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: if (nxt != &ptype_all) goto found; hash = 0; From 4064c461148ab129dfe5eaeea129b4af6cf4b9b7 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Tue, 28 Dec 2021 21:25:57 +0100 Subject: [PATCH 0475/1295] optee: add error checks in optee_ffa_do_call_with_arg() Adds error checking in optee_ffa_do_call_with_arg() for correctness. Fixes: 4615e5a34b95 ("optee: add FF-A support") Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/ffa_abi.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c index 20a1b1a3d965..0775759a29c0 100644 --- a/drivers/tee/optee/ffa_abi.c +++ b/drivers/tee/optee/ffa_abi.c @@ -619,9 +619,18 @@ static int optee_ffa_do_call_with_arg(struct tee_context *ctx, .data2 = (u32)(shm->sec_world_id >> 32), .data3 = shm->offset, }; - struct optee_msg_arg *arg = tee_shm_get_va(shm, 0); - unsigned int rpc_arg_offs = OPTEE_MSG_GET_ARG_SIZE(arg->num_params); - struct optee_msg_arg *rpc_arg = tee_shm_get_va(shm, rpc_arg_offs); + struct optee_msg_arg *arg; + unsigned int rpc_arg_offs; + struct optee_msg_arg *rpc_arg; + + arg = tee_shm_get_va(shm, 0); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + rpc_arg_offs = OPTEE_MSG_GET_ARG_SIZE(arg->num_params); + rpc_arg = tee_shm_get_va(shm, rpc_arg_offs); + if (IS_ERR(rpc_arg)) + return PTR_ERR(rpc_arg); return optee_ffa_yielding_call(ctx, &data, rpc_arg); } From 27a8caa59babb96c5890569e131bc0eb6d45daee Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 21 Jan 2022 16:57:31 -0800 Subject: [PATCH 0476/1295] ipv4: fix ip option filtering for locally generated fragments During IP fragmentation we sanitize IP options. This means overwriting options which should not be copied with NOPs. Only the first fragment has the original, full options. ip_fraglist_prepare() copies the IP header and options from previous fragment to the next one. Commit 19c3401a917b ("net: ipv4: place control buffer handling away from fragmentation iterators") moved sanitizing options before ip_fraglist_prepare() which means options are sanitized and then overwritten again with the old values. Fixing this is not enough, however, nor did the sanitization work prior to aforementioned commit. ip_options_fragment() (which does the sanitization) uses ipcb->opt.optlen for the length of the options. ipcb->opt of fragments is not populated (it's 0), only the head skb has the state properly built. So even when called at the right time ip_options_fragment() does nothing. This seems to date back all the way to v2.5.44 when the fast path for pre-fragmented skbs had been introduced. Prior to that ip_options_build() would have been called for every fragment (in fact ever since v2.5.44 the fragmentation handing in ip_options_build() has been dead code, I'll clean it up in -next). In the original patch (see Link) caixf mentions fixing the handling for fragments other than the second one, but I'm not sure how _any_ fragment could have had their options sanitized with the code as it stood. Tested with python (MTU on lo lowered to 1000 to force fragmentation): import socket s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.setsockopt(socket.IPPROTO_IP, socket.IP_OPTIONS, bytearray([7,4,5,192, 20|0x80,4,1,0])) s.sendto(b'1'*2000, ('127.0.0.1', 1234)) Before: IP (tos 0x0, ttl 64, id 1053, offset 0, flags [+], proto UDP (17), length 996, options (RR [bad length 4] [bad ptr 5] 192.148.4.1,,RA value 256)) localhost.36500 > localhost.search-agent: UDP, length 2000 IP (tos 0x0, ttl 64, id 1053, offset 968, flags [+], proto UDP (17), length 996, options (RR [bad length 4] [bad ptr 5] 192.148.4.1,,RA value 256)) localhost > localhost: udp IP (tos 0x0, ttl 64, id 1053, offset 1936, flags [none], proto UDP (17), length 100, options (RR [bad length 4] [bad ptr 5] 192.148.4.1,,RA value 256)) localhost > localhost: udp After: IP (tos 0x0, ttl 96, id 42549, offset 0, flags [+], proto UDP (17), length 996, options (RR [bad length 4] [bad ptr 5] 192.148.4.1,,RA value 256)) localhost.51607 > localhost.search-agent: UDP, bad length 2000 > 960 IP (tos 0x0, ttl 96, id 42549, offset 968, flags [+], proto UDP (17), length 996, options (NOP,NOP,NOP,NOP,RA value 256)) localhost > localhost: udp IP (tos 0x0, ttl 96, id 42549, offset 1936, flags [none], proto UDP (17), length 100, options (NOP,NOP,NOP,NOP,RA value 256)) localhost > localhost: udp RA (20 | 0x80) is now copied as expected, RR (7) is "NOPed out". Link: https://lore.kernel.org/netdev/20220107080559.122713-1-ooppublic@163.com/ Fixes: 19c3401a917b ("net: ipv4: place control buffer handling away from fragmentation iterators") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: caixf Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 57c1d8431386..e331c8d4e6cf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -825,15 +825,24 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); - if (iter.frag) - ip_options_fragment(iter.frag); - for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { + bool first_frag = (iter.offset == 0); + IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); + if (first_frag && IPCB(skb)->opt.optlen) { + /* ipcb->opt is not populated for frags + * coming from __ip_make_skb(), + * ip_options_fragment() needs optlen + */ + IPCB(iter.frag)->opt.optlen = + IPCB(skb)->opt.optlen; + ip_options_fragment(iter.frag); + ip_send_check(iter.iph); + } } skb->tstamp = tstamp; From db9f0e8bf79e6da7068b5818fea0ffd9d0d4b4da Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 21 Jan 2022 18:59:18 -0800 Subject: [PATCH 0477/1295] ibmvnic: Allow extra failures before disabling If auto-priority-failover (APF) is enabled and there are at least two backing devices of different priorities, some resets like fail-over, change-param etc can cause at least two back to back failovers. (Failover from high priority backing device to lower priority one and then back to the higher priority one if that is still functional). Depending on the timimg of the two failovers it is possible to trigger a "hard" reset and for the hard reset to fail due to failovers. When this occurs, the driver assumes that the network is unstable and disables the VNIC for a 60-second "settling time". This in turn can cause the ethtool command to fail with "No such device" while the vnic automatically recovers a little while later. Given that it's possible to have two back to back failures, allow for extra failures before disabling the vnic for the settling time. Fixes: f15fde9d47b8 ("ibmvnic: delay next reset if hard reset fails") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 59536bd5cab1..42bf7a76db8e 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2602,6 +2602,7 @@ static void __ibmvnic_reset(struct work_struct *work) struct ibmvnic_rwi *rwi; unsigned long flags; u32 reset_state; + int num_fails = 0; int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); @@ -2655,11 +2656,23 @@ static void __ibmvnic_reset(struct work_struct *work) rc = do_hard_reset(adapter, rwi, reset_state); rtnl_unlock(); } - if (rc) { - /* give backing device time to settle down */ + if (rc) + num_fails++; + else + num_fails = 0; + + /* If auto-priority-failover is enabled we can get + * back to back failovers during resets, resulting + * in at least two failed resets (from high-priority + * backing device to low-priority one and then back) + * If resets continue to fail beyond that, give the + * adapter some time to settle down before retrying. + */ + if (num_fails >= 3) { netdev_dbg(adapter->netdev, - "[S:%s] Hard reset failed, waiting 60 secs\n", - adapter_state_to_string(adapter->state)); + "[S:%s] Hard reset failed %d times, waiting 60 secs\n", + adapter_state_to_string(adapter->state), + num_fails); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(60 * HZ); } From 151b6a5c06b678687f64f2d9a99fd04d5cd32b72 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 21 Jan 2022 18:59:19 -0800 Subject: [PATCH 0478/1295] ibmvnic: init ->running_cap_crqs early We use ->running_cap_crqs to determine when the ibmvnic_tasklet() should send out the next protocol message type. i.e when we get back responses to all our QUERY_CAPABILITY CRQs we send out REQUEST_CAPABILITY crqs. Similiary, when we get responses to all the REQUEST_CAPABILITY crqs, we send out the QUERY_IP_OFFLOAD CRQ. We currently increment ->running_cap_crqs as we send out each CRQ and have the ibmvnic_tasklet() send out the next message type, when this running_cap_crqs count drops to 0. This assumes that all the CRQs of the current type were sent out before the count drops to 0. However it is possible that we send out say 6 CRQs, get preempted and receive all the 6 responses before we send out the remaining CRQs. This can result in ->running_cap_crqs count dropping to zero before all messages of the current type were sent and we end up sending the next protocol message too early. Instead initialize the ->running_cap_crqs upfront so the tasklet will only send the next protocol message after all responses are received. Use the cap_reqs local variable to also detect any discrepancy (either now or in future) in the number of capability requests we actually send. Currently only send_query_cap() is affected by this behavior (of sending next message early) since it is called from the worker thread (during reset) and from application thread (during ->ndo_open()) and they can be preempted. send_request_cap() is only called from the tasklet which processes CRQ responses sequentially, is not be affected. But to maintain the existing symmtery with send_query_capability() we update send_request_capability() also. Fixes: 249168ad07cd ("ibmvnic: Make CRQ interrupt tasklet wait for all capabilities crqs") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 106 +++++++++++++++++++---------- 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 42bf7a76db8e..c65e98bc4f7d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3857,11 +3857,25 @@ static void send_request_cap(struct ibmvnic_adapter *adapter, int retry) struct device *dev = &adapter->vdev->dev; union ibmvnic_crq crq; int max_entries; + int cap_reqs; + + /* We send out 6 or 7 REQUEST_CAPABILITY CRQs below (depending on + * the PROMISC flag). Initialize this count upfront. When the tasklet + * receives a response to all of these, it will send the next protocol + * message (QUERY_IP_OFFLOAD). + */ + if (!(adapter->netdev->flags & IFF_PROMISC) || + adapter->promisc_supported) + cap_reqs = 7; + else + cap_reqs = 6; if (!retry) { /* Sub-CRQ entries are 32 byte long */ int entries_page = 4 * PAGE_SIZE / (sizeof(u64) * 4); + atomic_set(&adapter->running_cap_crqs, cap_reqs); + if (adapter->min_tx_entries_per_subcrq > entries_page || adapter->min_rx_add_entries_per_subcrq > entries_page) { dev_err(dev, "Fatal, invalid entries per sub-crq\n"); @@ -3922,44 +3936,45 @@ static void send_request_cap(struct ibmvnic_adapter *adapter, int retry) adapter->opt_rx_comp_queues; adapter->req_rx_add_queues = adapter->max_rx_add_queues; + } else { + atomic_add(cap_reqs, &adapter->running_cap_crqs); } - memset(&crq, 0, sizeof(crq)); crq.request_capability.first = IBMVNIC_CRQ_CMD; crq.request_capability.cmd = REQUEST_CAPABILITY; crq.request_capability.capability = cpu_to_be16(REQ_TX_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_tx_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_rx_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_TX_ENTRIES_PER_SUBCRQ); crq.request_capability.number = cpu_to_be64(adapter->req_tx_entries_per_subcrq); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_ENTRIES_PER_SUBCRQ); crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_entries_per_subcrq); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_MTU); crq.request_capability.number = cpu_to_be64(adapter->req_mtu); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); if (adapter->netdev->flags & IFF_PROMISC) { @@ -3967,16 +3982,21 @@ static void send_request_cap(struct ibmvnic_adapter *adapter, int retry) crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); crq.request_capability.number = cpu_to_be64(1); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); } } else { crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); crq.request_capability.number = cpu_to_be64(0); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); } + + /* Keep at end to catch any discrepancy between expected and actual + * CRQs sent. + */ + WARN_ON(cap_reqs != 0); } static int pending_scrq(struct ibmvnic_adapter *adapter, @@ -4370,118 +4390,132 @@ static void send_query_map(struct ibmvnic_adapter *adapter) static void send_query_cap(struct ibmvnic_adapter *adapter) { union ibmvnic_crq crq; + int cap_reqs; + + /* We send out 25 QUERY_CAPABILITY CRQs below. Initialize this count + * upfront. When the tasklet receives a response to all of these, it + * can send out the next protocol messaage (REQUEST_CAPABILITY). + */ + cap_reqs = 25; + + atomic_set(&adapter->running_cap_crqs, cap_reqs); - atomic_set(&adapter->running_cap_crqs, 0); memset(&crq, 0, sizeof(crq)); crq.query_capability.first = IBMVNIC_CRQ_CMD; crq.query_capability.cmd = QUERY_CAPABILITY; crq.query_capability.capability = cpu_to_be16(MIN_TX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_ADD_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_ADD_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_ADD_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_ADD_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(TCP_IP_OFFLOAD); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(PROMISC_SUPPORTED); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_MTU); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_MTU); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_MULTICAST_FILTERS); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(VLAN_HEADER_INSERTION); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(RX_VLAN_HEADER_INSERTION); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_SG_ENTRIES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(RX_SG_SUPPORTED); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_TX_COMP_SUB_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_RX_COMP_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_RX_BUFADD_Q_PER_RX_COMP_Q); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_RXBA_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(TX_RX_DESC_REQ); - atomic_inc(&adapter->running_cap_crqs); + ibmvnic_send_crq(adapter, &crq); + cap_reqs--; + + /* Keep at end to catch any discrepancy between expected and actual + * CRQs sent. + */ + WARN_ON(cap_reqs != 0); } static void send_query_ip_offload(struct ibmvnic_adapter *adapter) @@ -4785,6 +4819,8 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, char *name; atomic_dec(&adapter->running_cap_crqs); + netdev_dbg(adapter->netdev, "Outstanding request-caps: %d\n", + atomic_read(&adapter->running_cap_crqs)); switch (be16_to_cpu(crq->request_capability_rsp.capability)) { case REQ_TX_QUEUES: req_value = &adapter->req_tx_queues; From 48079e7fdd0269d66b1d7d66ae88bd03162464ad Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 21 Jan 2022 18:59:20 -0800 Subject: [PATCH 0479/1295] ibmvnic: don't spin in tasklet ibmvnic_tasklet() continuously spins waiting for responses to all capability requests. It does this to avoid encountering an error during initialization of the vnic. However if there is a bug in the VIOS and we do not receive a response to one or more queries the tasklet ends up spinning continuously leading to hard lock ups. If we fail to receive a message from the VIOS it is reasonable to timeout the login attempt rather than spin indefinitely in the tasklet. Fixes: 249168ad07cd ("ibmvnic: Make CRQ interrupt tasklet wait for all capabilities crqs") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c65e98bc4f7d..3df7fd0b944e 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5499,12 +5499,6 @@ static void ibmvnic_tasklet(struct tasklet_struct *t) ibmvnic_handle_crq(crq, adapter); crq->generic.first = 0; } - - /* remain in tasklet until all - * capabilities responses are received - */ - if (!adapter->wait_capability) - done = true; } /* if capabilities CRQ's were sent in this tasklet, the following * tasklet must wait until all responses are received From 3a5d9db7fbdfc8207ddf70d92668ced0ab330701 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 21 Jan 2022 18:59:21 -0800 Subject: [PATCH 0480/1295] ibmvnic: remove unused ->wait_capability With previous bug fix, ->wait_capability flag is no longer needed and can be removed. Fixes: 249168ad07cd ("ibmvnic: Make CRQ interrupt tasklet wait for all capabilities crqs") Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 38 +++++++++++------------------- drivers/net/ethernet/ibm/ibmvnic.h | 1 - 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3df7fd0b944e..bda7a2a9d211 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4884,10 +4884,8 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, } /* Done receiving requested capabilities, query IP offload support */ - if (atomic_read(&adapter->running_cap_crqs) == 0) { - adapter->wait_capability = false; + if (atomic_read(&adapter->running_cap_crqs) == 0) send_query_ip_offload(adapter); - } } static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, @@ -5185,10 +5183,8 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, } out: - if (atomic_read(&adapter->running_cap_crqs) == 0) { - adapter->wait_capability = false; + if (atomic_read(&adapter->running_cap_crqs) == 0) send_request_cap(adapter, 0); - } } static int send_query_phys_parms(struct ibmvnic_adapter *adapter) @@ -5484,27 +5480,21 @@ static void ibmvnic_tasklet(struct tasklet_struct *t) struct ibmvnic_crq_queue *queue = &adapter->crq; union ibmvnic_crq *crq; unsigned long flags; - bool done = false; spin_lock_irqsave(&queue->lock, flags); - while (!done) { - /* Pull all the valid messages off the CRQ */ - while ((crq = ibmvnic_next_crq(adapter)) != NULL) { - /* This barrier makes sure ibmvnic_next_crq()'s - * crq->generic.first & IBMVNIC_CRQ_CMD_RSP is loaded - * before ibmvnic_handle_crq()'s - * switch(gen_crq->first) and switch(gen_crq->cmd). - */ - dma_rmb(); - ibmvnic_handle_crq(crq, adapter); - crq->generic.first = 0; - } + + /* Pull all the valid messages off the CRQ */ + while ((crq = ibmvnic_next_crq(adapter)) != NULL) { + /* This barrier makes sure ibmvnic_next_crq()'s + * crq->generic.first & IBMVNIC_CRQ_CMD_RSP is loaded + * before ibmvnic_handle_crq()'s + * switch(gen_crq->first) and switch(gen_crq->cmd). + */ + dma_rmb(); + ibmvnic_handle_crq(crq, adapter); + crq->generic.first = 0; } - /* if capabilities CRQ's were sent in this tasklet, the following - * tasklet must wait until all responses are received - */ - if (atomic_read(&adapter->running_cap_crqs) != 0) - adapter->wait_capability = true; + spin_unlock_irqrestore(&queue->lock, flags); } diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 4a8f36e0ab07..4a7a56ff74ce 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -919,7 +919,6 @@ struct ibmvnic_adapter { int login_rsp_buf_sz; atomic_t running_cap_crqs; - bool wait_capability; struct ibmvnic_sub_crq_queue **tx_scrq ____cacheline_aligned; struct ibmvnic_sub_crq_queue **rx_scrq ____cacheline_aligned; From c0bf3d8a943b6f2e912b7c1de03e2ef28e76f760 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sat, 22 Jan 2022 17:43:09 +0800 Subject: [PATCH 0481/1295] net/smc: Transitional solution for clcsock race issue We encountered a crash in smc_setsockopt() and it is caused by accessing smc->clcsock after clcsock was released. BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 50309 Comm: nginx Kdump: loaded Tainted: G E 5.16.0-rc4+ #53 RIP: 0010:smc_setsockopt+0x59/0x280 [smc] Call Trace: __sys_setsockopt+0xfc/0x190 __x64_sys_setsockopt+0x20/0x30 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f16ba83918e This patch tries to fix it by holding clcsock_release_lock and checking whether clcsock has already been released before access. In case that a crash of the same reason happens in smc_getsockopt() or smc_switch_to_fallback(), this patch also checkes smc->clcsock in them too. And the caller of smc_switch_to_fallback() will identify whether fallback succeeds according to the return value. Fixes: fd57770dd198 ("net/smc: wait for pending work before clcsock release_sock") Link: https://lore.kernel.org/lkml/5dd7ffd1-28e2-24cc-9442-1defec27375e@linux.ibm.com/T/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 63 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 961854e56736..d5ea62b82bb8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,12 +566,17 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } -static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + wait_queue_head_t *clc_wait; unsigned long flags; + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -586,18 +591,30 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * smc socket->wq, which should be removed * to clcsocket->wq during the fallback. */ + clc_wait = sk_sleep(smc->clcsock->sk); spin_lock_irqsave(&smc_wait->lock, flags); spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); } + mutex_unlock(&smc->clcsock_release_lock); + return 0; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc, reason_code); + struct net *net = sock_net(&smc->sk); + int rc = 0; + + rc = smc_switch_to_fallback(smc, reason_code); + if (rc) { /* fallback fails */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -1518,11 +1535,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); - if (reason_code < 0) { /* error, no fallback possible */ + if (reason_code < 0 || + smc_switch_to_fallback(new_smc, reason_code)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1964,8 +1982,11 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); return; } @@ -2254,7 +2275,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + if (rc) + goto out; } else { rc = -EINVAL; goto out; @@ -2447,6 +2470,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2456,6 +2484,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2472,7 +2501,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2515,13 +2544,23 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; + int rc; smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, - optval, optlen); + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, From 58cd4a088e8917b4092c7011d499e277e04a6644 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 21 Jan 2022 12:12:34 +0000 Subject: [PATCH 0482/1295] arm64: vdso: Fix "no previous prototype" warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If compiling the arm64 kernel with W=1 the following warning is produced: | arch/arm64/kernel/vdso/vgettimeofday.c:9:5: error: no previous prototype for ‘__kernel_clock_gettime’ [-Werror=missing-prototypes] | 9 | int __kernel_clock_gettime(clockid_t clock, | | ^~~~~~~~~~~~~~~~~~~~~~ | arch/arm64/kernel/vdso/vgettimeofday.c:15:5: error: no previous prototype for ‘__kernel_gettimeofday’ [-Werror=missing-prototypes] | 15 | int __kernel_gettimeofday(struct __kernel_old_timeval *tv, | | ^~~~~~~~~~~~~~~~~~~~~ | arch/arm64/kernel/vdso/vgettimeofday.c:21:5: error: no previous prototype for ‘__kernel_clock_getres’ [-Werror=missing-prototypes] | 21 | int __kernel_clock_getres(clockid_t clock_id, | | ^~~~~~~~~~~~~~~~~~~~~ This patch removes "-Wmissing-prototypes" and "-Wmissing-declarations" compilers flags from the compilation of vgettimeofday.c to make possible to build the kernel with CONFIG_WERROR enabled. Cc: Will Deacon Reported-by: Marc Kleine-Budde Signed-off-by: Vincenzo Frascino Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20220121121234.47273-1-vincenzo.frascino@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 60813497a381..172452f79e46 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -29,8 +29,11 @@ ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO +# -Wmissing-prototypes and -Wmissing-declarations are removed from +# the CFLAGS of vgettimeofday.c to make possible to build the +# kernel with CONFIG_WERROR enabled. CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \ - $(CC_FLAGS_LTO) + $(CC_FLAGS_LTO) -Wmissing-prototypes -Wmissing-declarations KASAN_SANITIZE := n KCSAN_SANITIZE := n UBSAN_SANITIZE := n From 2afc3b5a31f9edf3ef0f374f5d70610c79c93a42 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 22 Jan 2022 06:40:56 -0500 Subject: [PATCH 0483/1295] ping: fix the sk_bound_dev_if match in ping_lookup When 'ping' changes to use PING socket instead of RAW socket by: # sysctl -w net.ipv4.ping_group_range="0 100" the selftests 'router_broadcast.sh' will fail, as such command # ip vrf exec vrf-h1 ping -I veth0 198.51.100.255 -b can't receive the response skb by the PING socket. It's caused by mismatch of sk_bound_dev_if and dif in ping_rcv() when looking up the PING socket, as dif is vrf-h1 if dif's master was set to vrf-h1. This patch is to fix this regression by also checking the sk_bound_dev_if against sdif so that the packets can stil be received even if the socket is not bound to the vrf device but to the real iif. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Reported-by: Hangbin Liu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 0e56df3a45e2..bcf7bc71cb56 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -220,7 +220,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) continue; } - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != inet_sdif(skb)) continue; sock_hold(sk); From ebe0582bee78e221b7d9f09ff22a530e0ddd6c96 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 23 Jan 2022 07:53:46 +0100 Subject: [PATCH 0484/1295] net: atlantic: Use the bitmap API instead of hand-writing it Simplify code by using bitmap_weight() and bitmap_zero() instead of hand-writing these functions. Signed-off-by: Christophe JAILLET Reviewed-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_filters.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c index 1bc4d33a0ce5..30a573db02bb 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c @@ -826,7 +826,6 @@ int aq_filters_vlans_update(struct aq_nic_s *aq_nic) struct aq_hw_s *aq_hw = aq_nic->aq_hw; int hweight = 0; int err = 0; - int i; if (unlikely(!aq_hw_ops->hw_filter_vlan_set)) return -EOPNOTSUPP; @@ -837,8 +836,7 @@ int aq_filters_vlans_update(struct aq_nic_s *aq_nic) aq_nic->aq_hw_rx_fltrs.fl2.aq_vlans); if (aq_nic->ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) { - for (i = 0; i < BITS_TO_LONGS(VLAN_N_VID); i++) - hweight += hweight_long(aq_nic->active_vlans[i]); + hweight = bitmap_weight(aq_nic->active_vlans, VLAN_N_VID); err = aq_hw_ops->hw_filter_vlan_ctrl(aq_hw, false); if (err) @@ -871,7 +869,7 @@ int aq_filters_vlan_offload_off(struct aq_nic_s *aq_nic) struct aq_hw_s *aq_hw = aq_nic->aq_hw; int err = 0; - memset(aq_nic->active_vlans, 0, sizeof(aq_nic->active_vlans)); + bitmap_zero(aq_nic->active_vlans, VLAN_N_VID); aq_fvlan_rebuild(aq_nic, aq_nic->active_vlans, aq_nic->aq_hw_rx_fltrs.fl2.aq_vlans); From a37d9a17f099072fe4d3a9048b0321978707a918 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 20 Jan 2022 23:53:04 +0200 Subject: [PATCH 0485/1295] fsnotify: invalidate dcache before IN_DELETE event Apparently, there are some applications that use IN_DELETE event as an invalidation mechanism and expect that if they try to open a file with the name reported with the delete event, that it should not contain the content of the deleted file. Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify will have access to a positive dentry. This allowed a race where opening the deleted file via cached dentry is now possible after receiving the IN_DELETE event. To fix the regression, create a new hook fsnotify_delete() that takes the unlinked inode as an argument and use a helper d_delete_notify() to pin the inode, so we can pass it to fsnotify_delete() after d_delete(). Backporting hint: this regression is from v5.3. Although patch will apply with only trivial conflicts to v5.4 and v5.10, it won't build, because fsnotify_delete() implementation is different in each of those versions (see fsnotify_link()). A follow up patch will fix the fsnotify_unlink/rmdir() calls in pseudo filesystem that do not need to call d_delete(). Link: https://lore.kernel.org/r/20220120215305.282577-1-amir73il@gmail.com Reported-by: Ivan Delalande Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/ Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/btrfs/ioctl.c | 6 ++--- fs/namei.c | 10 ++++---- include/linux/fsnotify.h | 49 +++++++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5bd6926f7ff..7807b28b7892 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3086,10 +3086,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); - if (!err) { - fsnotify_rmdir(dir, dentry); - d_delete(dentry); - } + if (!err) + d_delete_notify(dir, dentry); out_dput: dput(dentry); diff --git a/fs/namei.c b/fs/namei.c index d81f04f8d818..4ed0e41feab7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3974,13 +3974,12 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); - fsnotify_rmdir(dir, dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) - d_delete(dentry); + d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); @@ -4102,7 +4101,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, if (!error) { dont_mount(dentry); detach_mounts(dentry); - fsnotify_unlink(dir, dentry); } } } @@ -4110,9 +4108,11 @@ out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ - if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { + fsnotify_unlink(dir, dentry); + } else if (!error) { fsnotify_link_count(target); - d_delete(dentry); + d_delete_notify(dir, dentry); } return error; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 3a2d7dc3c607..bb8467cd11ae 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -224,6 +224,43 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, dir, &new_dentry->d_name, 0); } +/* + * fsnotify_delete - @dentry was unlinked and unhashed + * + * Caller must make sure that dentry->d_name is stable. + * + * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode + * as this may be called after d_delete() and old_dentry may be negative. + */ +static inline void fsnotify_delete(struct inode *dir, struct inode *inode, + struct dentry *dentry) +{ + __u32 mask = FS_DELETE; + + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + + fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, + 0); +} + +/** + * d_delete_notify - delete a dentry and call fsnotify_delete() + * @dentry: The dentry to delete + * + * This helper is used to guaranty that the unlinked inode cannot be found + * by lookup of this name after fsnotify_delete() event has been delivered. + */ +static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + ihold(inode); + d_delete(dentry); + fsnotify_delete(dir, inode, dentry); + iput(inode); +} + /* * fsnotify_unlink - 'name' was unlinked * @@ -231,10 +268,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { - /* Expected to be called before d_delete() */ - WARN_ON_ONCE(d_is_negative(dentry)); + if (WARN_ON_ONCE(d_is_negative(dentry))) + return; - fsnotify_dirent(dir, dentry, FS_DELETE); + fsnotify_delete(dir, d_inode(dentry), dentry); } /* @@ -258,10 +295,10 @@ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { - /* Expected to be called before d_delete() */ - WARN_ON_ONCE(d_is_negative(dentry)); + if (WARN_ON_ONCE(d_is_negative(dentry))) + return; - fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR); + fsnotify_delete(dir, d_inode(dentry), dentry); } /* From 29044dae2e746949ad4b9cbdbfb248994d1dcdb4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 20 Jan 2022 23:53:05 +0200 Subject: [PATCH 0486/1295] fsnotify: fix fsnotify hooks in pseudo filesystems Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify will have access to a positive dentry. This allowed a race where opening the deleted file via cached dentry is now possible after receiving the IN_DELETE event. To fix the regression in pseudo filesystems, convert d_delete() calls to d_drop() (see commit 46c46f8df9aa ("devpts_pty_kill(): don't bother with d_delete()") and move the fsnotify hook after d_drop(). Add a missing fsnotify_unlink() hook in nfsdfs that was found during the audit of fsnotify hooks in pseudo filesystems. Note that the fsnotify hooks in simple_recursive_removal() follow d_invalidate(), so they require no change. Link: https://lore.kernel.org/r/20220120215305.282577-2-amir73il@gmail.com Reported-by: Ivan Delalande Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/ Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()") Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/configfs/dir.c | 6 +++--- fs/devpts/inode.c | 2 +- fs/nfsd/nfsctl.c | 5 +++-- net/sunrpc/rpc_pipe.c | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 1466b5d01cbb..d3cd2a94d1e8 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1780,8 +1780,8 @@ void configfs_unregister_group(struct config_group *group) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); + d_drop(dentry); fsnotify_rmdir(d_inode(parent), dentry); - d_delete(dentry); inode_unlock(d_inode(parent)); dput(dentry); @@ -1922,10 +1922,10 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(dentry)); - d_delete(dentry); + d_drop(dentry); + fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(root)); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 42e5a766d33c..4f25015aa534 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry) dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); - fsnotify_unlink(d_inode(dentry->d_parent), dentry); d_drop(dentry); + fsnotify_unlink(d_inode(dentry->d_parent), dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b9f27fbcd768..68b020f2002b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1247,7 +1247,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) clear_ncl(d_inode(dentry)); dget(dentry); ret = simple_unlink(dir, dentry); - d_delete(dentry); + d_drop(dentry); + fsnotify_unlink(dir, dentry); dput(dentry); WARN_ON_ONCE(ret); } @@ -1338,8 +1339,8 @@ void nfsd_client_rmdir(struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); WARN_ON_ONCE(ret); + d_drop(dentry); fsnotify_rmdir(dir, dentry); - d_delete(dentry); dput(dentry); inode_unlock(dir); } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ee5336d73fdd..35588f0afa86 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -600,9 +600,9 @@ static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); + d_drop(dentry); if (!ret) fsnotify_rmdir(dir, dentry); - d_delete(dentry); dput(dentry); return ret; } @@ -613,9 +613,9 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry) dget(dentry); ret = simple_unlink(dir, dentry); + d_drop(dentry); if (!ret) fsnotify_unlink(dir, dentry); - d_delete(dentry); dput(dentry); return ret; } From bdac3bbd0dc63873a9c606b8e4f814e6d61d288d Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Fri, 26 Nov 2021 16:43:42 +0100 Subject: [PATCH 0487/1295] spi: spi-rockchip: Add rk3568-spi compatible This adds a compatible string for the SPI controller found on the RK3566 and RK3568 SoCs. Signed-off-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/20211126154344.724316-2-frattaroli.nicolas@gmail.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-rockchip.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/spi/spi-rockchip.yaml b/Documentation/devicetree/bindings/spi/spi-rockchip.yaml index 7f987e79337c..52a78a2e362e 100644 --- a/Documentation/devicetree/bindings/spi/spi-rockchip.yaml +++ b/Documentation/devicetree/bindings/spi/spi-rockchip.yaml @@ -33,6 +33,7 @@ properties: - rockchip,rk3328-spi - rockchip,rk3368-spi - rockchip,rk3399-spi + - rockchip,rk3568-spi - rockchip,rv1126-spi - const: rockchip,rk3066-spi From de8a820df2acd02eac1d98a99dd447634226d653 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 23 Jan 2022 20:27:58 +0800 Subject: [PATCH 0488/1295] net: stmmac: remove unused members in struct stmmac_priv The tx_coalesce and mii_irq are not used at all now, so remove them. Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 40b5ed94cb54..5b195d5051d6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -194,7 +194,6 @@ struct stmmac_priv { u32 tx_coal_timer[MTL_MAX_TX_QUEUES]; u32 rx_coal_frames[MTL_MAX_TX_QUEUES]; - int tx_coalesce; int hwts_tx_en; bool tx_path_in_lpi_mode; bool tso; @@ -229,7 +228,6 @@ struct stmmac_priv { unsigned int flow_ctrl; unsigned int pause; struct mii_bus *mii; - int mii_irq[PHY_MAX_ADDR]; struct phylink_config phylink_config; struct phylink *phylink; From 7fc3b7c2981bbd1047916ade327beccb90994eee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Jan 2022 18:22:13 +0100 Subject: [PATCH 0489/1295] udf: Fix NULL ptr deref when converting from inline format udf_expand_file_adinicb() calls directly ->writepage to write data expanded into a page. This however misses to setup inode for writeback properly and so we can crash on inode->i_wb dereference when submitting page for IO like: BUG: kernel NULL pointer dereference, address: 0000000000000158 #PF: supervisor read access in kernel mode ... __folio_start_writeback+0x2ac/0x350 __block_write_full_page+0x37d/0x490 udf_expand_file_adinicb+0x255/0x400 [udf] udf_file_write_iter+0xbe/0x1b0 [udf] new_sync_write+0x125/0x1c0 vfs_write+0x28e/0x400 Fix the problem by marking the page dirty and going through the standard writeback path to write the page. Strictly speaking we would not even have to write the page but we want to catch e.g. ENOSPC errors early. Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/udf/inode.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d6b7a50736b..d6aa506b6b58 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -258,10 +258,6 @@ int udf_expand_file_adinicb(struct inode *inode) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; - struct writeback_control udf_wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - }; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { @@ -305,8 +301,10 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + set_page_dirty(page); + unlock_page(page); up_write(&iinfo->i_data_sem); - err = inode->i_data.a_ops->writepage(page, &udf_wbc); + err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); From ea8569194b43f0f01f0a84c689388542c7254a1f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Jan 2022 09:57:25 +0100 Subject: [PATCH 0490/1295] udf: Restore i_lenAlloc when inode expansion fails When we fail to expand inode from inline format to a normal format, we restore inode to contain the original inline formatting but we forgot to set i_lenAlloc back. The mismatch between i_lenAlloc and i_size was then causing further problems such as warnings and lost data down the line. Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 7e49b6f2480c ("udf: Convert UDF to new truncate calling sequence") Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/udf/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index d6aa506b6b58..ea8f6cd01f50 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -315,6 +315,7 @@ int udf_expand_file_adinicb(struct inode *inode) unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } put_page(page); From 9daf0a4d32d60a57f2a2533bdf4c178be7fdff7f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 16 Jan 2022 04:59:36 -0800 Subject: [PATCH 0491/1295] quota: cleanup double word in comment Remove the second 'handle'. Link: https://lore.kernel.org/r/20220116125936.389767-1-trix@redhat.com Signed-off-by: Tom Rix Signed-off-by: Jan Kara --- include/linux/quota.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/quota.h b/include/linux/quota.h index 18ebd39c9487..fd692b4a41d5 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -91,7 +91,7 @@ extern bool qid_valid(struct kqid qid); * * When there is no mapping defined for the user-namespace, type, * qid tuple an invalid kqid is returned. Callers are expected to - * test for and handle handle invalid kqids being returned. + * test for and handle invalid kqids being returned. * Invalid kqids may be tested for using qid_valid(). */ static inline struct kqid make_kqid(struct user_namespace *from, From 94fea1d8a30eadc3ef07afc0f53dc06799bb300b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 22 Jan 2022 01:52:11 +0000 Subject: [PATCH 0492/1295] KVM: VMX: Zero host's SYSENTER_ESP iff SYSENTER is NOT used Zero vmcs.HOST_IA32_SYSENTER_ESP when initializing *constant* host state if and only if SYSENTER cannot be used, i.e. the kernel is a 64-bit kernel and is not emulating 32-bit syscalls. As the name suggests, vmx_set_constant_host_state() is intended for state that is *constant*. When SYSENTER is used, SYSENTER_ESP isn't constant because stacks are per-CPU, and the VMCS must be updated whenever the vCPU is migrated to a new CPU. The logic in vmx_vcpu_load_vmcs() doesn't differentiate between "never loaded" and "loaded on a different CPU", i.e. setting SYSENTER_ESP on VMCS load also handles setting correct host state when the VMCS is first loaded. Because a VMCS must be loaded before it is initialized during vCPU RESET, zeroing the field in vmx_set_constant_host_state() obliterates the value that was written when the VMCS was loaded. If the vCPU is run before it is migrated, the subsequent VM-Exit will zero out MSR_IA32_SYSENTER_ESP, leading to a #DF on the next 32-bit syscall. double fault: 0000 [#1] SMP CPU: 0 PID: 990 Comm: stable Not tainted 5.16.0+ #97 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 EIP: entry_SYSENTER_32+0x0/0xe7 Code: <9c> 50 eb 17 0f 20 d8 a9 00 10 00 00 74 0d 25 ff ef ff ff 0f 22 d8 EAX: 000000a2 EBX: a8d1300c ECX: a8d13014 EDX: 00000000 ESI: a8f87000 EDI: a8d13014 EBP: a8d12fc0 ESP: 00000000 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 EFLAGS: 00210093 CR0: 80050033 CR2: fffffffc CR3: 02c3b000 CR4: 00152e90 Fixes: 6ab8a4053f71 ("KVM: VMX: Avoid to rdmsrl(MSR_IA32_SYSENTER_ESP)") Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Message-Id: <20220122015211.1468758-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a02a28ce7cc3..705e5c082738 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write32(HOST_IA32_SYSENTER_CS, low32); /* - * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites - * HOST_IA32_SYSENTER_ESP. + * SYSENTER is used for 32-bit system calls on either 32-bit or + * 64-bit kernels. It is always zero If neither is allowed, otherwise + * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may + * have already done so!). */ - vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ From adb759e599990416e42e659c024a654b76c84617 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 23 Jan 2022 13:42:19 +0100 Subject: [PATCH 0493/1295] x86,kvm/xen: Remove superfluous .fixup usage Commit 14243b387137 ("KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery") adds superfluous .fixup usage after the whole .fixup section was removed in commit e5eefda5aa51 ("x86: Remove .fixup section"). Fixes: 14243b387137 ("KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery") Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Message-Id: <20220123124219.GH20638@worktop.programming.kicks-ass.net> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 0e3f7d6e9fd7..bad57535fad0 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) "\tnotq %0\n" "\t" LOCK_PREFIX "andq %0, %2\n" "2:\n" - "\t.section .fixup,\"ax\"\n" - "3:\tjmp\t2b\n" - "\t.previous\n" - _ASM_EXTABLE_UA(1b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : "=r" (evtchn_pending_sel), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) @@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) "\tnotl %0\n" "\t" LOCK_PREFIX "andl %0, %2\n" "2:\n" - "\t.section .fixup,\"ax\"\n" - "3:\tjmp\t2b\n" - "\t.previous\n" - _ASM_EXTABLE_UA(1b, 3b) + _ASM_EXTABLE_UA(1b, 2b) : "=r" (evtchn_pending_sel32), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) From 1625566ec8fd3a42e305c5118df81fb113eb60a7 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 24 Jan 2022 10:04:56 +0800 Subject: [PATCH 0494/1295] KVM: remove async parameter of hva_to_pfn_remapped() The async parameter of hva_to_pfn_remapped() is not used, so remove it. Signed-off-by: Xianting Tian Message-Id: <20220124020456.156386-1-xianting.tian@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a1164483e6c..0bbb3cbf6014 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2463,9 +2463,8 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn) } static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool *async, - bool write_fault, bool *writable, - kvm_pfn_t *p_pfn) + unsigned long addr, bool write_fault, + bool *writable, kvm_pfn_t *p_pfn) { kvm_pfn_t pfn; pte_t *ptep; @@ -2575,7 +2574,7 @@ retry: if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) From 9ff5549b1d1d3c3a9d71220d44bd246586160f1d Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 16 Jan 2022 11:18:31 -0800 Subject: [PATCH 0495/1295] video: hyperv_fb: Fix validation of screen resolution In the WIN10 version of the Synthetic Video protocol with Hyper-V, Hyper-V reports a list of supported resolutions as part of the protocol negotiation. The driver calculates the maximum width and height from the list of resolutions, and uses those maximums to validate any screen resolution specified in the video= option on the kernel boot line. This method of validation is incorrect. For example, the list of supported resolutions could contain 1600x1200 and 1920x1080, both of which fit in an 8 Mbyte frame buffer. But calculating the max width and height yields 1920 and 1200, and 1920x1200 resolution does not fit in an 8 Mbyte frame buffer. Unfortunately, this resolution is accepted, causing a kernel fault when the driver accesses memory outside the frame buffer. Instead, validate the specified screen resolution by calculating its size, and comparing against the frame buffer size. Delete the code for calculating the max width and height from the list of resolutions, since these max values have no use. Also add the frame buffer size to the info message to aid in understanding why a resolution might be rejected. Fixes: 67e7cdb4829d ("video: hyperv: hyperv_fb: Obtain screen resolution from Hyper-V host") Signed-off-by: Michael Kelley Reviewed-by: Haiyang Zhang Acked-by: Helge Deller Link: https://lore.kernel.org/r/1642360711-2335-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/video/fbdev/hyperv_fb.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index 23999df52739..c8e0ea27caf1 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -287,8 +287,6 @@ struct hvfb_par { static uint screen_width = HVFB_WIDTH; static uint screen_height = HVFB_HEIGHT; -static uint screen_width_max = HVFB_WIDTH; -static uint screen_height_max = HVFB_HEIGHT; static uint screen_depth; static uint screen_fb_size; static uint dio_fb_size; /* FB size for deferred IO */ @@ -582,7 +580,6 @@ static int synthvid_get_supported_resolution(struct hv_device *hdev) int ret = 0; unsigned long t; u8 index; - int i; memset(msg, 0, sizeof(struct synthvid_msg)); msg->vid_hdr.type = SYNTHVID_RESOLUTION_REQUEST; @@ -613,13 +610,6 @@ static int synthvid_get_supported_resolution(struct hv_device *hdev) goto out; } - for (i = 0; i < msg->resolution_resp.resolution_count; i++) { - screen_width_max = max_t(unsigned int, screen_width_max, - msg->resolution_resp.supported_resolution[i].width); - screen_height_max = max_t(unsigned int, screen_height_max, - msg->resolution_resp.supported_resolution[i].height); - } - screen_width = msg->resolution_resp.supported_resolution[index].width; screen_height = @@ -941,7 +931,7 @@ static void hvfb_get_option(struct fb_info *info) if (x < HVFB_WIDTH_MIN || y < HVFB_HEIGHT_MIN || (synthvid_ver_ge(par->synthvid_version, SYNTHVID_VERSION_WIN10) && - (x > screen_width_max || y > screen_height_max)) || + (x * y * screen_depth / 8 > screen_fb_size)) || (par->synthvid_version == SYNTHVID_VERSION_WIN8 && x * y * screen_depth / 8 > SYNTHVID_FB_SIZE_WIN8) || (par->synthvid_version == SYNTHVID_VERSION_WIN7 && @@ -1194,8 +1184,8 @@ static int hvfb_probe(struct hv_device *hdev, } hvfb_get_option(info); - pr_info("Screen resolution: %dx%d, Color depth: %d\n", - screen_width, screen_height, screen_depth); + pr_info("Screen resolution: %dx%d, Color depth: %d, Frame buffer size: %d\n", + screen_width, screen_height, screen_depth, screen_fb_size); ret = hvfb_getmem(hdev, info); if (ret) { From 72bb9dcb6c33cfac80282713c2b4f2b254cd24d1 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 24 Jan 2022 08:45:37 +0530 Subject: [PATCH 0496/1295] arm64: Add Cortex-X2 CPU part definition Add the CPU Partnumbers for the new Arm designs. Cc: Will Deacon Cc: Suzuki Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/1642994138-25887-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 19b8441aa8f2..657eeb06c784 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -74,6 +74,7 @@ #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D #define ARM_CPU_PART_CORTEX_A710 0xD47 +#define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define APM_CPU_PART_POTENZA 0x000 @@ -116,6 +117,7 @@ #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) +#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) From eb30d838a44c9e59a2a106884f536119859c7257 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 24 Jan 2022 08:45:38 +0530 Subject: [PATCH 0497/1295] arm64: errata: Update ARM64_ERRATUM_[2119858|2224489] with Cortex-X2 ranges Errata ARM64_ERRATUM_[2119858|2224489] also affect some Cortex-X2 ranges as well. Lets update these errata definition and detection to accommodate all new Cortex-X2 based cpu MIDR ranges. Cc: Will Deacon Cc: Mathieu Poirier Cc: Suzuki Poulose Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/1642994138-25887-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- Documentation/arm64/silicon-errata.rst | 4 ++++ arch/arm64/Kconfig | 12 ++++++------ arch/arm64/kernel/cpu_errata.c | 2 ++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 5342e895fb60..8789c79310bb 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -98,6 +98,10 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2224489 | ARM64_ERRATUM_2224489 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-X2 | #2119858 | ARM64_ERRATUM_2119858 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-X2 | #2224489 | ARM64_ERRATUM_2224489 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1349291 | N/A | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6978140edfa4..77b8f653f4bc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -671,14 +671,14 @@ config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE bool config ARM64_ERRATUM_2119858 - bool "Cortex-A710: 2119858: workaround TRBE overwriting trace data in FILL mode" + bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode" default y depends on CORESIGHT_TRBE select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE help - This option adds the workaround for ARM Cortex-A710 erratum 2119858. + This option adds the workaround for ARM Cortex-A710/X2 erratum 2119858. - Affected Cortex-A710 cores could overwrite up to 3 cache lines of trace + Affected Cortex-A710/X2 cores could overwrite up to 3 cache lines of trace data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in the event of a WRAP event. @@ -761,14 +761,14 @@ config ARM64_ERRATUM_2253138 If unsure, say Y. config ARM64_ERRATUM_2224489 - bool "Cortex-A710: 2224489: workaround TRBE writing to address out-of-range" + bool "Cortex-A710/X2: 2224489: workaround TRBE writing to address out-of-range" depends on CORESIGHT_TRBE default y select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE help - This option adds the workaround for ARM Cortex-A710 erratum 2224489. + This option adds the workaround for ARM Cortex-A710/X2 erratum 2224489. - Affected Cortex-A710 cores might write to an out-of-range address, not reserved + Affected Cortex-A710/X2 cores might write to an out-of-range address, not reserved for TRBE. Under some conditions, the TRBE might generate a write to the next virtually addressed page following the last page of the TRBE address space (i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping around to the base. diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 9e1c1aef9ebd..29cc062a4153 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -347,6 +347,7 @@ static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { #endif #ifdef CONFIG_ARM64_ERRATUM_2119858 MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), #endif {}, }; @@ -371,6 +372,7 @@ static struct midr_range trbe_write_out_of_range_cpus[] = { #endif #ifdef CONFIG_ARM64_ERRATUM_2224489 MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), #endif {}, }; From 1e0924bd09916fab795fc2a21ec1d148f24299fd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 24 Jan 2022 17:17:54 +0900 Subject: [PATCH 0498/1295] arm64: Mark start_backtrace() notrace and NOKPROBE_SYMBOL Mark the start_backtrace() as notrace and NOKPROBE_SYMBOL because this function is called from ftrace and lockdep to get the caller address via return_address(). The lockdep is used in kprobes, it should also be NOKPROBE_SYMBOL. Fixes: b07f3499661c ("arm64: stacktrace: Move start_backtrace() out of the header") Cc: # 5.13.x Signed-off-by: Masami Hiramatsu Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/164301227374.1433152.12808232644267107415.stgit@devnote2 Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 0fb58fed54cb..e4103e085681 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -33,8 +33,8 @@ */ -static void start_backtrace(struct stackframe *frame, unsigned long fp, - unsigned long pc) +static notrace void start_backtrace(struct stackframe *frame, unsigned long fp, + unsigned long pc) { frame->fp = fp; frame->pc = pc; @@ -55,6 +55,7 @@ static void start_backtrace(struct stackframe *frame, unsigned long fp, frame->prev_fp = 0; frame->prev_type = STACK_TYPE_UNKNOWN; } +NOKPROBE_SYMBOL(start_backtrace); /* * Unwind from one frame record (A) to the next frame record (B). From 77311237eaffa240af6eae1d511b61e77a20a2ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 12 Jan 2022 22:58:46 +0200 Subject: [PATCH 0499/1295] pinctrl: Place correctly CONFIG_PINCTRL_ST in the Makefile Keep Makefile entries ordered in the same way as Kconfig ones. Reported-by: Linus Torvalds Signed-off-by: Andy Shevchenko --- drivers/pinctrl/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 08c364d611f5..f64d29f614ec 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -42,9 +42,9 @@ obj-$(CONFIG_PINCTRL_PISTACHIO) += pinctrl-pistachio.o obj-$(CONFIG_PINCTRL_RK805) += pinctrl-rk805.o obj-$(CONFIG_PINCTRL_ROCKCHIP) += pinctrl-rockchip.o obj-$(CONFIG_PINCTRL_SINGLE) += pinctrl-single.o +obj-$(CONFIG_PINCTRL_ST) += pinctrl-st.o obj-$(CONFIG_PINCTRL_STARFIVE) += pinctrl-starfive.o obj-$(CONFIG_PINCTRL_STMFX) += pinctrl-stmfx.o -obj-$(CONFIG_PINCTRL_ST) += pinctrl-st.o obj-$(CONFIG_PINCTRL_SX150X) += pinctrl-sx150x.o obj-$(CONFIG_PINCTRL_TB10X) += pinctrl-tb10x.o obj-$(CONFIG_PINCTRL_THUNDERBAY) += pinctrl-thunderbay.o From e986f0e602f19ecb7880b04dd1db415ed9bca3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Bartosik?= Date: Mon, 24 Jan 2022 13:55:29 +0100 Subject: [PATCH 0500/1295] pinctrl: intel: fix unexpected interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS Chromebook C223 with Celeron N3350 crashes sometimes during cold booot. Inspection of the kernel log showed that it gets into an inifite loop logging the following message: ->handle_irq(): 000000009cdb51e8, handle_bad_irq+0x0/0x251 ->irq_data.chip(): 000000005ec212a7, 0xffffa043009d8e7 ->action(): 00000 IRQ_NOPROBE set unexpected IRQ trap at vector 7c The issue happens during cold boot but only if cold boot happens at most several dozen seconds after Chromebook is powered off. For longer intervals between power off and power on (cold boot) the issue does not reproduce. The unexpected interrupt is sourced from INT3452 GPIO pin which is used for SD card detect. Investigation relevealed that when the interval between power off and power on (cold boot) is less than several dozen seconds then values of INT3452 GPIO interrupt enable and interrupt pending registers survive power off and power on sequence and interrupt for SD card detect pin is enabled and pending during probe of SD controller which causes the unexpected IRQ message. "Intel Pentium and Celeron Processor N- and J- Series" volume 3 doc mentions that GPIO interrupt enable and status registers default value is 0x0. The fix clears INT3452 GPIO interrupt enabled and interrupt pending registers in its probe function. Fixes: 7981c0015af2 ("pinctrl: intel: Add Intel Sunrisepoint pin controller and GPIO support") Signed-off-by: Łukasz Bartosik Signed-off-by: Andy Shevchenko --- drivers/pinctrl/intel/pinctrl-intel.c | 54 +++++++++++++++++---------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 85750974d182..e9bb98cb9112 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -1216,6 +1216,39 @@ static irqreturn_t intel_gpio_irq(int irq, void *data) return IRQ_RETVAL(ret); } +static void intel_gpio_irq_init(struct intel_pinctrl *pctrl) +{ + int i; + + for (i = 0; i < pctrl->ncommunities; i++) { + const struct intel_community *community; + void __iomem *base; + unsigned int gpp; + + community = &pctrl->communities[i]; + base = community->regs; + + for (gpp = 0; gpp < community->ngpps; gpp++) { + /* Mask and clear all interrupts */ + writel(0, base + community->ie_offset + gpp * 4); + writel(0xffff, base + community->is_offset + gpp * 4); + } + } +} + +static int intel_gpio_irq_init_hw(struct gpio_chip *gc) +{ + struct intel_pinctrl *pctrl = gpiochip_get_data(gc); + + /* + * Make sure the interrupt lines are in a proper state before + * further configuration. + */ + intel_gpio_irq_init(pctrl); + + return 0; +} + static int intel_gpio_add_community_ranges(struct intel_pinctrl *pctrl, const struct intel_community *community) { @@ -1320,6 +1353,7 @@ static int intel_gpio_probe(struct intel_pinctrl *pctrl, int irq) girq->num_parents = 0; girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_bad_irq; + girq->init_hw = intel_gpio_irq_init_hw; ret = devm_gpiochip_add_data(pctrl->dev, &pctrl->chip, pctrl); if (ret) { @@ -1695,26 +1729,6 @@ int intel_pinctrl_suspend_noirq(struct device *dev) } EXPORT_SYMBOL_GPL(intel_pinctrl_suspend_noirq); -static void intel_gpio_irq_init(struct intel_pinctrl *pctrl) -{ - size_t i; - - for (i = 0; i < pctrl->ncommunities; i++) { - const struct intel_community *community; - void __iomem *base; - unsigned int gpp; - - community = &pctrl->communities[i]; - base = community->regs; - - for (gpp = 0; gpp < community->ngpps; gpp++) { - /* Mask and clear all interrupts */ - writel(0, base + community->ie_offset + gpp * 4); - writel(0xffff, base + community->is_offset + gpp * 4); - } - } -} - static bool intel_gpio_update_reg(void __iomem *reg, u32 mask, u32 value) { u32 curr, updated; From e12963c453263d5321a2c610e98cbc731233b685 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 20:19:15 +0200 Subject: [PATCH 0501/1295] pinctrl: intel: Fix a glitch when updating IRQ flags on a preconfigured line The commit af7e3eeb84e2 ("pinctrl: intel: Disable input and output buffer when switching to GPIO") hadn't taken into account an update of the IRQ flags scenario. When updating the IRQ flags on the preconfigured line the ->irq_set_type() is called again. In such case the sequential Rx buffer configuration changes may trigger a falling or rising edge interrupt that may lead, on some platforms, to an undesired event. This may happen because each of intel_gpio_set_gpio_mode() and __intel_gpio_set_direction() updates the pad configuration with a different value of the GPIORXDIS bit. Notable, that the intel_gpio_set_gpio_mode() is called only for the pads that are configured as an input. Due to this fact, integrate the logic of __intel_gpio_set_direction() call into the intel_gpio_set_gpio_mode() so that the Rx buffer won't be disabled and immediately re-enabled. Fixes: af7e3eeb84e2 ("pinctrl: intel: Disable input and output buffer when switching to GPIO") Reported-by: Kane Chen Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg Tested-by: Grace Kao --- drivers/pinctrl/intel/pinctrl-intel.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index e9bb98cb9112..826d494f3cc6 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -451,8 +451,8 @@ static void intel_gpio_set_gpio_mode(void __iomem *padcfg0) value &= ~PADCFG0_PMODE_MASK; value |= PADCFG0_PMODE_GPIO; - /* Disable input and output buffers */ - value |= PADCFG0_GPIORXDIS; + /* Disable TX buffer and enable RX (this will be input) */ + value &= ~PADCFG0_GPIORXDIS; value |= PADCFG0_GPIOTXDIS; /* Disable SCI/SMI/NMI generation */ @@ -497,9 +497,6 @@ static int intel_gpio_request_enable(struct pinctrl_dev *pctldev, intel_gpio_set_gpio_mode(padcfg0); - /* Disable TX buffer and enable RX (this will be input) */ - __intel_gpio_set_direction(padcfg0, true); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -1115,9 +1112,6 @@ static int intel_gpio_irq_type(struct irq_data *d, unsigned int type) intel_gpio_set_gpio_mode(reg); - /* Disable TX buffer and enable RX (this will be input) */ - __intel_gpio_set_direction(reg, true); - value = readl(reg); value &= ~(PADCFG0_RXEVCFG_MASK | PADCFG0_RXINV); From e33f42b20bcb2f55cb1eeeab9956a503dcf36107 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 13 Jan 2022 13:18:45 +0800 Subject: [PATCH 0502/1295] erofs: fix fsdax partition offset handling After seeking time on testing today upstream fsdax, I found it actually doesn't work well as below: [ 186.492983] ------------[ cut here ]------------ [ 186.493629] WARNING: CPU: 1 PID: 205 at fs/iomap/iter.c:33 iomap_iter+0x2f6/0x310 The problem is that m_dax_part_off should be applied to physical addresses and very sorry about that I didn't catch this eariler. Anyway, let's fix it up now. Also, I need to find a way to set up a standalone testcase to look after this later. Link: https://lore.kernel.org/r/20220113051845.244461-1-hsiangkao@linux.alibaba.com Fixes: de2051147771 ("fsdax: shift partition offset handling into the file systems") Reviewed-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fa7ddb7ad980..226a57c57ee6 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -252,12 +252,10 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->offset = map.m_la; - if (flags & IOMAP_DAX) { + if (flags & IOMAP_DAX) iomap->dax_dev = mdev.m_daxdev; - iomap->offset += mdev.m_dax_part_off; - } else { + else iomap->bdev = mdev.m_bdev; - } iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; @@ -284,6 +282,8 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dax_part_off; } return 0; } From 7865827c432bf9885ee26e5767697c3d9e21a82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 21 Jan 2022 17:14:12 +0800 Subject: [PATCH 0503/1295] erofs: avoid unnecessary z_erofs_decompressqueue_work() declaration Just code rearrange. No logic changes. Link: https://lore.kernel.org/r/20220121091412.86086-1-hsiangkao@linux.alibaba.com Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 113 +++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 498b7666efe8..423bc1a61da5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -810,68 +810,11 @@ static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, return false; } -static void z_erofs_decompressqueue_work(struct work_struct *work); -static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) -{ - struct erofs_sb_info *const sbi = EROFS_SB(io->sb); - - /* wake up the caller thread for sync decompression */ - if (sync) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); - if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); - return; - } - - if (atomic_add_return(bios, &io->pending_bios)) - return; - /* Use workqueue and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { - queue_work(z_erofs_workqueue, &io->u.work); - /* enable sync decompression for readahead */ - if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) - sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; - return; - } - z_erofs_decompressqueue_work(&io->u.work); -} - static bool z_erofs_page_is_invalidated(struct page *page) { return !page->mapping && !z_erofs_is_shortlived_page(page); } -static void z_erofs_decompressqueue_endio(struct bio *bio) -{ - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); - blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (err) - SetPageError(page); - - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } - } - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); - bio_put(bio); -} - static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) @@ -1123,6 +1066,35 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) kvfree(bgq); } +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) +{ + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + + /* wake up the caller thread for sync decompression */ + if (sync) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { + queue_work(z_erofs_workqueue, &io->u.work); + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + return; + } + z_erofs_decompressqueue_work(&io->u.work); +} + static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, struct page **pagepool, @@ -1300,6 +1272,33 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (err) + SetPageError(page); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + bio_put(bio); +} + static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, struct page **pagepool, From 40c67c291a93f8846c4a972c9ef1b7ba4544c8d0 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Wed, 12 Jan 2022 16:31:56 +0800 Subject: [PATCH 0504/1295] mmc: sdhci-of-esdhc: Check for error num after setting mask Because of the possible failure of the dma_supported(), the dma_set_mask_and_coherent() may return error num. Therefore, it should be better to check it and return the error if fails. And since the sdhci_setup_host() has already checked the return value of the enable_dma, we need not check it in sdhci_resume_host() again. Fixes: 5552d7ad596c ("mmc: sdhci-of-esdhc: set proper dma mask for ls104x chips") Signed-off-by: Jiasheng Jiang Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220112083156.1124782-1-jiasheng@iscas.ac.cn Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index a593b1fbd69e..0f3658b36513 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -524,12 +524,16 @@ static void esdhc_of_adma_workaround(struct sdhci_host *host, u32 intmask) static int esdhc_of_enable_dma(struct sdhci_host *host) { + int ret; u32 value; struct device *dev = mmc_dev(host->mmc); if (of_device_is_compatible(dev->of_node, "fsl,ls1043a-esdhc") || - of_device_is_compatible(dev->of_node, "fsl,ls1046a-esdhc")) - dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); + of_device_is_compatible(dev->of_node, "fsl,ls1046a-esdhc")) { + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); + if (ret) + return ret; + } value = sdhci_readl(host, ESDHC_DMA_SYSCTL); From 4d315357b3d6c315a7260420c6c6fc076e58d14b Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Wed, 19 Jan 2022 20:00:06 +0800 Subject: [PATCH 0505/1295] mmc: sh_mmcif: Check for null res pointer If there is no suitable resource, platform_get_resource() will return NULL. Therefore in order to avoid the dereference of the NULL pointer, it should be better to check the 'res'. Signed-off-by: Jiasheng Jiang Cc: stable@vger.kernel.org # v5.16+ Link: https://lore.kernel.org/r/20220119120006.1426964-1-jiasheng@iscas.ac.cn Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mmcif.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index bcc595c70a9f..104dcd702870 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -405,6 +405,9 @@ static int sh_mmcif_dma_slave_config(struct sh_mmcif_host *host, struct dma_slave_config cfg = { 0, }; res = platform_get_resource(host->pd, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; + cfg.direction = direction; if (direction == DMA_DEV_TO_MEM) { From 379f56c24e698f14242f532b1d0a0f1747725e08 Mon Sep 17 00:00:00 2001 From: Andrey Skvortsov Date: Sat, 15 Jan 2022 15:14:46 +0300 Subject: [PATCH 0506/1295] mmc: core: Wait for command setting 'Power Off Notification' bit to complete SD card is allowed to signal busy on DAT0 up to 1s after the CMD49. According to SD spec (version 6.0 section 5.8.1.3) first host waits until busy of CMD49 is released and only then polls Power Management Status register up to 1s until the card indicates ready to power off. Without waiting for busy before polling status register sometimes card becomes unresponsive and system fails to suspend: [ 205.907459] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done. [ 206.421274] sunxi-mmc 1c0f000.mmc: data error, sending stop command [ 206.421321] sunxi-mmc 1c0f000.mmc: send stop command failed [ 206.421347] mmc0: error -110 reading status reg of PM func [ 206.421366] PM: dpm_run_callback(): mmc_bus_suspend+0x0/0x74 returns -110 [ 206.421402] mmcblk mmc0:aaaa: PM: failed to suspend async: error -110 [ 206.437064] PM: Some devices failed to suspend, or early wake event detected Tested with Sandisk Extreme PRO A2 64GB on Allwinner A64 system. Signed-off-by: Andrey Skvortsov Fixes: 2c5d42769038 ("mmc: core: Add support for Power Off Notification for SD cards") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220115121447.641524-1-andrej.skvortzov@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/sd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 45f578793980..bd87012c220c 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -67,7 +67,7 @@ static const unsigned int sd_au_size[] = { __res & __mask; \ }) -#define SD_POWEROFF_NOTIFY_TIMEOUT_MS 2000 +#define SD_POWEROFF_NOTIFY_TIMEOUT_MS 1000 #define SD_WRITE_EXTR_SINGLE_TIMEOUT_MS 1000 struct sd_busy_data { @@ -1664,6 +1664,12 @@ static int sd_poweroff_notify(struct mmc_card *card) goto out; } + /* Find out when the command is completed. */ + err = mmc_poll_for_busy(card, SD_WRITE_EXTR_SINGLE_TIMEOUT_MS, false, + MMC_BUSY_EXTR_SINGLE); + if (err) + goto out; + cb_data.card = card; cb_data.reg_buf = reg_buf; err = __mmc_poll_for_busy(card->host, SD_POWEROFF_NOTIFY_TIMEOUT_MS, From 0cb5950f3f3b51a4e8657d106f897f2b913e0586 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 14:27:56 +0000 Subject: [PATCH 0507/1295] btrfs: fix deadlock when reserving space during defrag When defragging we can end up collecting a range for defrag that has already pages under delalloc (dirty), as long as the respective extent map for their range is not mapped to a hole, a prealloc extent or the extent map is from an old generation. Most of the time that is harmless from a functional perspective at least, however it can result in a deadlock: 1) At defrag_collect_targets() we find an extent map that meets all requirements but there's delalloc for the range it covers, and we add its range to list of ranges to defrag; 2) The defrag_collect_targets() function is called at defrag_one_range(), after it locked a range that overlaps the range of the extent map; 3) At defrag_one_range(), while the range is still locked, we call defrag_one_locked_target() for the range associated to the extent map we collected at step 1); 4) Then finally at defrag_one_locked_target() we do a call to btrfs_delalloc_reserve_space(), which will reserve data and metadata space. If the space reservations can not be satisfied right away, the flusher might be kicked in and start flushing delalloc and wait for the respective ordered extents to complete. If this happens we will deadlock, because both flushing delalloc and finishing an ordered extent, requires locking the range in the inode's io tree, which was already locked at defrag_collect_targets(). So fix this by skipping extent maps for which there's already delalloc. Fixes: eb793cf857828d ("btrfs: defrag: introduce helper to collect target file extents") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6d5d6d2c4ad1..a883f1824a17 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + /* * For do_compress case, we want to compress all valid file * extents, thus no @extent_thresh or mergeable check. @@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (em->len >= extent_thresh) + if (range_len >= extent_thresh) goto next; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, From 3c9d31c715948aaff0ee6d322a91a2dec07770bf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 17:11:52 +0000 Subject: [PATCH 0508/1295] btrfs: add back missing dirty page rate limiting to defrag A defrag operation can dirty a lot of pages, specially if operating on the entire file or a large file range. Any task dirtying pages should periodically call balance_dirty_pages_ratelimited(), as stated in that function's comments, otherwise they can leave too many dirty pages in the system. This is what we did before the refactoring in 5.16, and it should have remained, just like in the buffered write path and relocation. So restore that behaviour. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a883f1824a17..ac420060aac3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1579,6 +1579,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, } while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; u64 cluster_end; /* The cluster size 256K should always be page aligned */ @@ -1610,6 +1611,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, §ors_defragged, max_to_defrag); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; From 27cdfde181bcacd226c230b2fd831f6f5b8c215f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 17:41:17 +0000 Subject: [PATCH 0509/1295] btrfs: update writeback index when starting defrag When starting a defrag, we should update the writeback index of the inode's mapping in case it currently has a value beyond the start of the range we are defragging. This can help performance and often result in getting less extents after writeback - for e.g., if the current value of the writeback index sits somewhere in the middle of a range that gets dirty by the defrag, then after writeback we can get two smaller extents instead of a single, larger extent. We used to have this before the refactoring in 5.16, but it was removed without any reason to do so. Originally it was added in kernel 3.1, by commit 2a0f7f5769992b ("Btrfs: fix recursive auto-defrag"), in order to fix a loop with autodefrag resulting in dirtying and writing pages over and over, but some testing on current code did not show that happening, at least with the test described in that commit. So add back the behaviour, as at the very least it is a nice to have optimization. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac420060aac3..eef5b300b9a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1537,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; if (isize == 0) return 0; @@ -1578,6 +1579,14 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; + while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; u64 cluster_end; From 984d1efff2304833e20fce89046ef8a89fb51d15 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 10 Jan 2022 09:20:19 +0100 Subject: [PATCH 0510/1295] mailmap: update email address of Brian Silverman Brian Silverman's address at bluerivertech.com is not valid anymore, use Brian's private email address instead. Link: https://lore.kernel.org/all/20220110082359.2019735-1-mkl@pengutronix.de Cc: Brian Silverman Signed-off-by: Marc Kleine-Budde --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index b157f88ce26a..b76e520809d0 100644 --- a/.mailmap +++ b/.mailmap @@ -70,6 +70,7 @@ Boris Brezillon Boris Brezillon Brian Avery Brian King +Brian Silverman Changbin Du Changbin Du Chao Yu From 17a30422621c0e04cb6060d20d7edcefd7463347 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 14 Jan 2022 18:47:41 +0100 Subject: [PATCH 0511/1295] dt-bindings: can: tcan4x5x: fix mram-cfg RX FIFO config This tcan4x5x only comes with 2K of MRAM, a RX FIFO with a dept of 32 doesn't fit into the MRAM. Use a depth of 16 instead. Fixes: 4edd396a1911 ("dt-bindings: can: tcan4x5x: Add DT bindings for TCAN4x5X driver") Link: https://lore.kernel.org/all/20220119062951.2939851-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- Documentation/devicetree/bindings/net/can/tcan4x5x.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt b/Documentation/devicetree/bindings/net/can/tcan4x5x.txt index 0968b40aef1e..e3501bfa22e9 100644 --- a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt +++ b/Documentation/devicetree/bindings/net/can/tcan4x5x.txt @@ -31,7 +31,7 @@ tcan4x5x: tcan4x5x@0 { #address-cells = <1>; #size-cells = <1>; spi-max-frequency = <10000000>; - bosch,mram-cfg = <0x0 0 0 32 0 0 1 1>; + bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; interrupt-parent = <&gpio1>; interrupts = <14 IRQ_TYPE_LEVEL_LOW>; device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; From db72589c49fd260bfc99c7160c079675bc7417af Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 14 Jan 2022 15:35:01 +0100 Subject: [PATCH 0512/1295] can: m_can: m_can_fifo_{read,write}: don't read or write from/to FIFO if length is 0 In order to optimize FIFO access, especially on m_can cores attached to slow busses like SPI, in patch | e39381770ec9 ("can: m_can: Disable IRQs on FIFO bus errors") bulk read/write support has been added to the m_can_fifo_{read,write} functions. That change leads to the tcan driver to call regmap_bulk_{read,write}() with a length of 0 (for CAN frames with 0 data length). regmap treats this as an error: | tcan4x5x spi1.0 tcan4x5x0: FIFO write returned -22 This patch fixes the problem by not calling the cdev->ops->{read,write)_fifo() in case of a 0 length read/write. Fixes: e39381770ec9 ("can: m_can: Disable IRQs on FIFO bus errors") Link: https://lore.kernel.org/all/20220114155751.2651888-1-mkl@pengutronix.de Cc: stable@vger.kernel.org Cc: Matt Kline Cc: Chandrasekar Ramakrishnan Reported-by: Michael Anochin Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 5b47cd867783..1a4b56f6fa8c 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -336,6 +336,9 @@ m_can_fifo_read(struct m_can_classdev *cdev, u32 addr_offset = cdev->mcfg[MRAM_RXF0].off + fgi * RXF0_ELEMENT_SIZE + offset; + if (val_count == 0) + return 0; + return cdev->ops->read_fifo(cdev, addr_offset, val, val_count); } @@ -346,6 +349,9 @@ m_can_fifo_write(struct m_can_classdev *cdev, u32 addr_offset = cdev->mcfg[MRAM_TXB].off + fpi * TXB_ELEMENT_SIZE + offset; + if (val_count == 0) + return 0; + return cdev->ops->write_fifo(cdev, addr_offset, val, val_count); } From e59986de5ff701494e14c722b78b6e6d513e0ab5 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 14 Jan 2022 18:50:54 +0100 Subject: [PATCH 0513/1295] can: tcan4x5x: regmap: fix max register value The MRAM of the tcan4x5x has a size of 2K and starts at 0x8000. There are no further registers in the tcan4x5x making 0x87fc the biggest addressable register. This patch fixes the max register value of the regmap config from 0x8ffc to 0x87fc. Fixes: 6e1caaf8ed22 ("can: tcan4x5x: fix max register value") Link: https://lore.kernel.org/all/20220119064011.2943292-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x-regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/tcan4x5x-regmap.c b/drivers/net/can/m_can/tcan4x5x-regmap.c index ca80dbaf7a3f..26e212b8ca7a 100644 --- a/drivers/net/can/m_can/tcan4x5x-regmap.c +++ b/drivers/net/can/m_can/tcan4x5x-regmap.c @@ -12,7 +12,7 @@ #define TCAN4X5X_SPI_INSTRUCTION_WRITE (0x61 << 24) #define TCAN4X5X_SPI_INSTRUCTION_READ (0x41 << 24) -#define TCAN4X5X_MAX_REGISTER 0x8ffc +#define TCAN4X5X_MAX_REGISTER 0x87fc static int tcan4x5x_regmap_gather_write(void *context, const void *reg, size_t reg_len, From f04aefd4659b7959e50e6d0d649936c6940f9d34 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 21 Jan 2022 09:22:34 +0100 Subject: [PATCH 0514/1295] can: flexcan: mark RX via mailboxes as supported on MCF5441X Most flexcan IP cores support 2 RX modes: - FIFO - mailbox The flexcan IP core on the MCF5441X cannot receive CAN RTR messages via mailboxes. However the mailbox mode is more performant. The commit | 1c45f5778a3b ("can: flexcan: add ethtool support to change rx-rtr setting during runtime") added support to switch from FIFO to mailbox mode on these cores. After testing the mailbox mode on the MCF5441X by Angelo Dureghello, this patch marks it (without RTR capability) as supported. Further the IP core overview table is updated, that RTR reception via mailboxes is not supported. Link: https://lore.kernel.org/all/20220121084425.3141218-1-mkl@pengutronix.de Tested-by: Angelo Dureghello Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan/flexcan-core.c | 1 + drivers/net/can/flexcan/flexcan.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index 0bff1884d5cc..74d7fcbfd065 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -296,6 +296,7 @@ static_assert(sizeof(struct flexcan_regs) == 0x4 * 18 + 0xfb8); static const struct flexcan_devtype_data fsl_mcf5441x_devtype_data = { .quirks = FLEXCAN_QUIRK_BROKEN_PERR_STATE | FLEXCAN_QUIRK_NR_IRQ_3 | FLEXCAN_QUIRK_NR_MB_16 | + FLEXCAN_QUIRK_SUPPPORT_RX_MAILBOX | FLEXCAN_QUIRK_SUPPPORT_RX_FIFO, }; diff --git a/drivers/net/can/flexcan/flexcan.h b/drivers/net/can/flexcan/flexcan.h index fccdff8b1f0f..23fc09a7e10f 100644 --- a/drivers/net/can/flexcan/flexcan.h +++ b/drivers/net/can/flexcan/flexcan.h @@ -21,7 +21,7 @@ * Below is some version info we got: * SOC Version IP-Version Glitch- [TR]WRN_INT IRQ Err Memory err RTR rece- FD Mode MB * Filter? connected? Passive detection ption in MB Supported? - * MCF5441X FlexCAN2 ? no yes no no yes no 16 + * MCF5441X FlexCAN2 ? no yes no no no no 16 * MX25 FlexCAN2 03.00.00.00 no no no no no no 64 * MX28 FlexCAN2 03.00.04.00 yes yes no no no no 64 * MX35 FlexCAN2 03.00.00.00 no no no no no no 64 From 82880283d7fcd0a1d20964a56d6d1a5cc0df0713 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 20 Jan 2022 23:37:48 +0000 Subject: [PATCH 0515/1295] objtool: Fix truncated string warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On GCC 12, the build fails due to a possible truncated string: check.c: In function 'validate_call': check.c:2865:58: error: '%d' directive output may be truncated writing between 1 and 10 bytes into a region of size 9 [-Werror=format-truncation=] 2865 | snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); | ^~ In theory it's a valid bug: static char pvname[16]; int idx; ... idx = (rel->addend / sizeof(void *)); snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); There are only 7 chars for %d while it could take up to 9, so the printed "pv_ops[%d]" string could get truncated. In reality the bug should never happen, because pv_ops only has ~80 entries, so 7 chars for the integer is more than enough. Still, it's worth fixing. Bump the buffer size by 2 bytes to silence the warning. [ jpoimboe: changed size to 19; massaged changelog ] Fixes: db2b0c5d7b6f ("objtool: Support pv_opsindirect calls for noinstr") Reported-by: Adam Borowski Reported-by: Martin Liška Signed-off-by: Sergei Trofimovich Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220120233748.2062559-1-slyich@gmail.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c2d2ab9a2861..7c33ec67c4a9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2854,7 +2854,7 @@ static inline bool func_uaccess_safe(struct symbol *func) static inline const char *call_dest_name(struct instruction *insn) { - static char pvname[16]; + static char pvname[19]; struct reloc *rel; int idx; From 165216533dda560f2620ce8f61381a9ee0ca57ba Mon Sep 17 00:00:00 2001 From: Aswath Govindraju Date: Thu, 23 Dec 2021 17:46:49 +0530 Subject: [PATCH 0516/1295] arm64: dts: ti: k3-j721s2: Move aliases to board dts Aliases are board specific and should be in board dts files. So, move aliases to board dts and trim the list to interfaces that are actually enabled. Signed-off-by: Aswath Govindraju Signed-off-by: Vignesh Raghavendra Signed-off-by: Nishanth Menon Reviewed-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20211223121650.26868-2-vigneshr@ti.com --- .../dts/ti/k3-j721s2-common-proc-board.dts | 10 +++++++++ arch/arm64/boot/dts/ti/k3-j721s2.dtsi | 22 ------------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts b/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts index a5a24f9f46c5..708c14338eb7 100644 --- a/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts +++ b/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts @@ -19,6 +19,16 @@ bootargs = "console=ttyS10,115200n8 earlycon=ns16550a,mmio32,2880000"; }; + aliases { + serial1 = &mcu_uart0; + serial10 = &main_uart8; + mmc0 = &main_sdhci0; + mmc1 = &main_sdhci1; + can0 = &main_mcan16; + can1 = &mcu_mcan0; + can2 = &mcu_mcan1; + }; + evm_12v0: fixedregulator-evm12v0 { /* main supply */ compatible = "regulator-fixed"; diff --git a/arch/arm64/boot/dts/ti/k3-j721s2.dtsi b/arch/arm64/boot/dts/ti/k3-j721s2.dtsi index 80d3cae03e88..fe5234c40f6c 100644 --- a/arch/arm64/boot/dts/ti/k3-j721s2.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721s2.dtsi @@ -21,28 +21,6 @@ #address-cells = <2>; #size-cells = <2>; - aliases { - serial0 = &wkup_uart0; - serial1 = &mcu_uart0; - serial2 = &main_uart0; - serial3 = &main_uart1; - serial4 = &main_uart2; - serial5 = &main_uart3; - serial6 = &main_uart4; - serial7 = &main_uart5; - serial8 = &main_uart6; - serial9 = &main_uart7; - serial10 = &main_uart8; - serial11 = &main_uart9; - mmc0 = &main_sdhci0; - mmc1 = &main_sdhci1; - can0 = &main_mcan16; - can1 = &mcu_mcan0; - can2 = &mcu_mcan1; - can3 = &main_mcan3; - can4 = &main_mcan5; - }; - chosen { }; cpus { From aee744a37aaf277e74557e683cc524fbe6daeef7 Mon Sep 17 00:00:00 2001 From: Aswath Govindraju Date: Thu, 23 Dec 2021 17:46:50 +0530 Subject: [PATCH 0517/1295] arm64: dts: ti: k3-j721s2-common-proc-board: Alias console uart to serial2 On J721s2 Linux console is on main_uart8 but to be consistent with other J7 family of devices, alias it to ttyS2 (serial2). This also eliminates need to have higher number of 8250 runtime UARTs. Signed-off-by: Aswath Govindraju Signed-off-by: Vignesh Raghavendra Signed-off-by: Nishanth Menon Reviewed-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20211223121650.26868-3-vigneshr@ti.com --- arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts b/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts index 708c14338eb7..b210cc07c539 100644 --- a/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts +++ b/arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dts @@ -15,13 +15,13 @@ model = "Texas Instruments J721S2 EVM"; chosen { - stdout-path = "serial10:115200n8"; - bootargs = "console=ttyS10,115200n8 earlycon=ns16550a,mmio32,2880000"; + stdout-path = "serial2:115200n8"; + bootargs = "console=ttyS2,115200n8 earlycon=ns16550a,mmio32,2880000"; }; aliases { serial1 = &mcu_uart0; - serial10 = &main_uart8; + serial2 = &main_uart8; mmc0 = &main_sdhci0; mmc1 = &main_sdhci1; can0 = &main_mcan16; From 79da533d3cc717ccc05ddbd3190da8a72bc2408b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 23 Jan 2022 18:23:22 -0800 Subject: [PATCH 0518/1295] hwmon: (nct6775) Fix crash in clear_caseopen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paweł Marciniak reports the following crash, observed when clearing the chassis intrusion alarm. BUG: kernel NULL pointer dereference, address: 0000000000000028 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 4815 Comm: bash Tainted: G S 5.16.2-200.fc35.x86_64 #1 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z97 Extreme4, BIOS P2.60A 05/03/2018 RIP: 0010:clear_caseopen+0x5a/0x120 [nct6775] Code: 68 70 e8 e9 32 b1 e3 85 c0 0f 85 d2 00 00 00 48 83 7c 24 ... RSP: 0018:ffffabcb02803dd8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 RDX: ffff8e8808192880 RSI: 0000000000000000 RDI: ffff8e87c7509a68 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000000000a R10: 000000000000000a R11: f000000000000000 R12: 000000000000001f R13: ffff8e87c7509828 R14: ffff8e87c7509a68 R15: ffff8e88494527a0 FS: 00007f4db9151740(0000) GS:ffff8e8ebfec0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000028 CR3: 0000000166b66001 CR4: 00000000001706e0 Call Trace: kernfs_fop_write_iter+0x11c/0x1b0 new_sync_write+0x10b/0x180 vfs_write+0x209/0x2a0 ksys_write+0x4f/0xc0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The problem is that the device passed to clear_caseopen() is the hwmon device, not the platform device, and the platform data is not set in the hwmon device. Store the pointer to sio_data in struct nct6775_data and get if from there if needed. Fixes: 2e7b9886968b ("hwmon: (nct6775) Use superio_*() function pointers in sio_data.") Cc: Denis Pauk Cc: Bernhard Seibold Reported-by: Paweł Marciniak Tested-by: Denis Pauk Signed-off-by: Guenter Roeck --- drivers/hwmon/nct6775.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/nct6775.c b/drivers/hwmon/nct6775.c index fd3f91cb01c6..098d12b9ecda 100644 --- a/drivers/hwmon/nct6775.c +++ b/drivers/hwmon/nct6775.c @@ -1175,7 +1175,7 @@ static inline u8 in_to_reg(u32 val, u8 nr) struct nct6775_data { int addr; /* IO base of hw monitor block */ - int sioreg; /* SIO register address */ + struct nct6775_sio_data *sio_data; enum kinds kind; const char *name; @@ -3559,7 +3559,7 @@ clear_caseopen(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nct6775_data *data = dev_get_drvdata(dev); - struct nct6775_sio_data *sio_data = dev_get_platdata(dev); + struct nct6775_sio_data *sio_data = data->sio_data; int nr = to_sensor_dev_attr(attr)->index - INTRUSION_ALARM_BASE; unsigned long val; u8 reg; @@ -3967,7 +3967,7 @@ static int nct6775_probe(struct platform_device *pdev) return -ENOMEM; data->kind = sio_data->kind; - data->sioreg = sio_data->sioreg; + data->sio_data = sio_data; if (sio_data->access == access_direct) { data->addr = res->start; From 926fd9f23b27ca6587492c3f58f4c7f4cd01dad5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 13 Jan 2022 11:44:38 -0800 Subject: [PATCH 0519/1295] ima: fix reference leak in asymmetric_verify() Don't leak a reference to the key if its algorithm is unknown. Fixes: 947d70597236 ("ima: Support EC keys for signature verification") Cc: # v5.13+ Signed-off-by: Eric Biggers Reviewed-by: Stefan Berger Reviewed-by: Tianjia Zhang Signed-off-by: Mimi Zohar --- security/integrity/digsig_asymmetric.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index 23240d793b07..895f4b9ce8c6 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -109,22 +109,25 @@ int asymmetric_verify(struct key *keyring, const char *sig, pk = asymmetric_key_public_key(key); pks.pkey_algo = pk->pkey_algo; - if (!strcmp(pk->pkey_algo, "rsa")) + if (!strcmp(pk->pkey_algo, "rsa")) { pks.encoding = "pkcs1"; - else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) + } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { /* edcsa-nist-p192 etc. */ pks.encoding = "x962"; - else if (!strcmp(pk->pkey_algo, "ecrdsa") || - !strcmp(pk->pkey_algo, "sm2")) + } else if (!strcmp(pk->pkey_algo, "ecrdsa") || + !strcmp(pk->pkey_algo, "sm2")) { pks.encoding = "raw"; - else - return -ENOPKG; + } else { + ret = -ENOPKG; + goto out; + } pks.digest = (u8 *)data; pks.digest_size = datalen; pks.s = hdr->sig; pks.s_size = siglen; ret = verify_signature(key, &pks); +out: key_put(key); pr_debug("%s() = %d\n", __func__, ret); return ret; From 84d46e1fc33cc56b622b2e7a91703157161ce2e9 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 16 Dec 2021 11:11:03 +0800 Subject: [PATCH 0520/1295] drm/msm: remove variable set but not used The code that uses variable mdss has been removed, So the declaration and assignment of the variable can be removed. Eliminate the following clang warning: drivers/gpu/drm/msm/msm_drv.c:513:19: warning: variable 'mdss' set but not used [-Wunused-but-set-variable] Reported-by: Abaci Robot Fixes: 2027e5b3413d ("drm/msm: Initialize MDSS irq domain at probe time") Signed-off-by: Yang Li Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211216031103.34146-1-yang.lee@linux.alibaba.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/msm_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index ad35a5d94053..59e30192cdf6 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -510,7 +510,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) struct msm_drm_private *priv = dev_get_drvdata(dev); struct drm_device *ddev; struct msm_kms *kms; - struct msm_mdss *mdss; int ret, i; ddev = drm_dev_alloc(drv, dev); @@ -521,8 +520,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) ddev->dev_private = priv; priv->dev = ddev; - mdss = priv->mdss; - priv->wq = alloc_ordered_workqueue("msm", 0); priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD; From c04c3148ca12227d92f91b355b4538cc333c9922 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 30 Dec 2021 07:09:40 +0000 Subject: [PATCH 0521/1295] drm/msm/dsi: Fix missing put_device() call in dsi_get_phy If of_find_device_by_node() succeeds, dsi_get_phy() doesn't a corresponding put_device(). Thus add put_device() to fix the exception handling. Fixes: ec31abf ("drm/msm/dsi: Separate PHY to another platform device") Signed-off-by: Miaoqian Lin Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211230070943.18116-1-linmq006@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/dsi/dsi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c index 052548883d27..0fe02529b5e7 100644 --- a/drivers/gpu/drm/msm/dsi/dsi.c +++ b/drivers/gpu/drm/msm/dsi/dsi.c @@ -40,7 +40,12 @@ static int dsi_get_phy(struct msm_dsi *msm_dsi) of_node_put(phy_node); - if (!phy_pdev || !msm_dsi->phy) { + if (!phy_pdev) { + DRM_DEV_ERROR(&pdev->dev, "%s: phy driver is not ready\n", __func__); + return -EPROBE_DEFER; + } + if (!msm_dsi->phy) { + put_device(&phy_pdev->dev); DRM_DEV_ERROR(&pdev->dev, "%s: phy driver is not ready\n", __func__); return -EPROBE_DEFER; } From 774fe0cd838d1b1419d41ab4ea0613c80d4ecbd7 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 7 Jan 2022 08:50:22 +0000 Subject: [PATCH 0522/1295] drm/msm/hdmi: Fix missing put_device() call in msm_hdmi_get_phy The reference taken by 'of_find_device_by_node()' must be released when not needed anymore. Add the corresponding 'put_device()' in the error handling path. Fixes: e00012b256d4 ("drm/msm/hdmi: Make HDMI core get its PHY") Signed-off-by: Miaoqian Lin Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220107085026.23831-1-linmq006@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/hdmi/hdmi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/hdmi/hdmi.c b/drivers/gpu/drm/msm/hdmi/hdmi.c index 3acdeae25caf..719720709e9e 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi.c @@ -97,10 +97,15 @@ static int msm_hdmi_get_phy(struct hdmi *hdmi) of_node_put(phy_node); - if (!phy_pdev || !hdmi->phy) { + if (!phy_pdev) { DRM_DEV_ERROR(&pdev->dev, "phy driver is not ready\n"); return -EPROBE_DEFER; } + if (!hdmi->phy) { + DRM_DEV_ERROR(&pdev->dev, "phy driver is not ready\n"); + put_device(&phy_pdev->dev); + return -EPROBE_DEFER; + } hdmi->phy_dev = get_device(&phy_pdev->dev); From 170b22234d5495f5e0844246e23f004639ee89ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Sun, 9 Jan 2022 20:24:31 +0100 Subject: [PATCH 0523/1295] drm/msm/dpu: invalid parameter check in dpu_setup_dspp_pcc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function performs a check on the "ctx" input parameter, however, it is used before the check. Initialize the "base" variable after the sanity check to avoid a possible NULL pointer dereference. Fixes: 4259ff7ae509e ("drm/msm/dpu: add support for pcc color block in dpu driver") Addresses-Coverity-ID: 1493866 ("Null pointer dereference") Signed-off-by: José Expósito Link: https://lore.kernel.org/r/20220109192431.135949-1-jose.exposito89@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c index a98e964c3b6f..355894a3b48c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c @@ -26,9 +26,16 @@ static void dpu_setup_dspp_pcc(struct dpu_hw_dspp *ctx, struct dpu_hw_pcc_cfg *cfg) { - u32 base = ctx->cap->sblk->pcc.base; + u32 base; - if (!ctx || !base) { + if (!ctx) { + DRM_ERROR("invalid ctx %pK\n", ctx); + return; + } + + base = ctx->cap->sblk->pcc.base; + + if (!base) { DRM_ERROR("invalid ctx %pK pcc base 0x%x\n", ctx, base); return; } From 0a727b459ee39bd4c5ced19d6024258ac87b6b2e Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Wed, 12 Jan 2022 20:33:34 +0800 Subject: [PATCH 0524/1295] drm/msm: Fix wrong size calculation For example, memory-region in .dts as below, reg = <0x0 0x50000000 0x0 0x20000000> We can get below values, struct resource r; r.start = 0x50000000; r.end = 0x6fffffff; So the size should be: size = r.end - r.start + 1 = 0x20000000 Signed-off-by: Xianting Tian Fixes: 072f1f9168ed ("drm/msm: add support for "stolen" mem") Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220112123334.749776-1-xianting.tian@linux.alibaba.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/msm_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 59e30192cdf6..555666e3f960 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -461,7 +461,7 @@ static int msm_init_vram(struct drm_device *dev) of_node_put(node); if (ret) return ret; - size = r.end - r.start; + size = r.end - r.start + 1; DRM_INFO("using VRAM carveout: %lx@%pa\n", size, &r.start); /* if we have no IOMMU, then we need to use carveout allocator. From 860a7b2a87b7c743154824d0597b6c3eb3b53154 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 13 Jan 2022 08:32:13 -0800 Subject: [PATCH 0525/1295] drm/msm/a6xx: Add missing suspend_count increment Reported-by: Danylo Piliaiev Fixes: 3ab1c5cc3939 ("drm/msm: Add param for userspace to query suspend count") Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220113163215.215367-1-robdclark@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 51b83776951b..17cfad6424db 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1560,6 +1560,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) for (i = 0; i < gpu->nr_rings; i++) a6xx_gpu->shadow[i] = 0; + gpu->suspend_count++; + return 0; } From 5e761a2287234bc402ba7ef07129f5103bcd775c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Sun, 16 Jan 2022 19:18:44 +0100 Subject: [PATCH 0526/1295] drm/msm/dsi: invalid parameter check in msm_dsi_phy_enable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function performs a check on the "phy" input parameter, however, it is used before the check. Initialize the "dev" variable after the sanity check to avoid a possible NULL pointer dereference. Fixes: 5c8290284402b ("drm/msm/dsi: Split PHY drivers to separate files") Addresses-Coverity-ID: 1493860 ("Null pointer dereference") Signed-off-by: José Expósito Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220116181844.7400-1-jose.exposito89@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/dsi/phy/dsi_phy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c index c2ed177717c7..2027b38617ab 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c @@ -808,12 +808,14 @@ int msm_dsi_phy_enable(struct msm_dsi_phy *phy, struct msm_dsi_phy_clk_request *clk_req, struct msm_dsi_phy_shared_timings *shared_timings) { - struct device *dev = &phy->pdev->dev; + struct device *dev; int ret; if (!phy || !phy->cfg->ops.enable) return -EINVAL; + dev = &phy->pdev->dev; + ret = dsi_phy_enable_resource(phy); if (ret) { DRM_DEV_ERROR(dev, "%s: resource enable failed, %d\n", From 63ee956f69d8c181e5251c7ce58b84c1edec0f6a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 24 Jan 2022 20:20:51 -0800 Subject: [PATCH 0527/1295] bpf: Fix renaming task_getsecid_subj->current_getsecid_subj. The commit 6326948f940d missed renaming of task->current LSM hook in BTF_ID. Fix it to silence build warning: WARN: resolve_btfids: unresolved symbol bpf_lsm_task_getsecid_subj Fixes: 6326948f940d ("lsm: security_task_getsecid_subj() -> security_current_getsecid_subj()") Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b8..9e4ecc990647 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) From 61263b3a11a2594b4e898f166c31162236182b5c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 11 Jan 2022 09:24:41 +0800 Subject: [PATCH 0528/1295] scsi: elx: efct: Don't use GFP_KERNEL under spin lock GFP_KERNEL/GFP_DMA can't be used under a spin lock. According the comment, els_ios_lock is used to protect els ios list so we can move down the spin lock to avoid using this flag under the lock. Link: https://lore.kernel.org/r/20220111012441.3232527-1-yangyingliang@huawei.com Fixes: 8f406ef72859 ("scsi: elx: libefc: Extended link Service I/O handling") Reported-by: Hulk Robot Reviewed-by: James Smart Signed-off-by: Yang Yingliang Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/libefc/efc_els.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/elx/libefc/efc_els.c b/drivers/scsi/elx/libefc/efc_els.c index 7bb4f9aad2c8..84bc81d7ce76 100644 --- a/drivers/scsi/elx/libefc/efc_els.c +++ b/drivers/scsi/elx/libefc/efc_els.c @@ -46,18 +46,14 @@ efc_els_io_alloc_size(struct efc_node *node, u32 reqlen, u32 rsplen) efc = node->efc; - spin_lock_irqsave(&node->els_ios_lock, flags); - if (!node->els_io_enabled) { efc_log_err(efc, "els io alloc disabled\n"); - spin_unlock_irqrestore(&node->els_ios_lock, flags); return NULL; } els = mempool_alloc(efc->els_io_pool, GFP_ATOMIC); if (!els) { atomic_add_return(1, &efc->els_io_alloc_failed_count); - spin_unlock_irqrestore(&node->els_ios_lock, flags); return NULL; } @@ -74,7 +70,6 @@ efc_els_io_alloc_size(struct efc_node *node, u32 reqlen, u32 rsplen) &els->io.req.phys, GFP_KERNEL); if (!els->io.req.virt) { mempool_free(els, efc->els_io_pool); - spin_unlock_irqrestore(&node->els_ios_lock, flags); return NULL; } @@ -94,10 +89,11 @@ efc_els_io_alloc_size(struct efc_node *node, u32 reqlen, u32 rsplen) /* add els structure to ELS IO list */ INIT_LIST_HEAD(&els->list_entry); + spin_lock_irqsave(&node->els_ios_lock, flags); list_add_tail(&els->list_entry, &node->els_ios_list); + spin_unlock_irqrestore(&node->els_ios_lock, flags); } - spin_unlock_irqrestore(&node->els_ios_lock, flags); return els; } From a861790afaa8b6369eee8a88c5d5d73f5799c0c6 Mon Sep 17 00:00:00 2001 From: ZouMingzhe Date: Tue, 11 Jan 2022 13:47:42 +0800 Subject: [PATCH 0529/1295] scsi: target: iscsi: Make sure the np under each tpg is unique iscsit_tpg_check_network_portal() has nested for_each loops and is supposed to return true when a match is found. However, the tpg loop will still continue after existing the tpg_np loop. If this tpg_np is not the last the match value will be changed. Break the outer loop after finding a match and make sure the np under each tpg is unique. Link: https://lore.kernel.org/r/20220111054742.19582-1-mingzhe.zou@easystack.cn Signed-off-by: ZouMingzhe Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/iscsi/iscsi_target_tpg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c index 8075f60fd02c..2d5cf1714ae0 100644 --- a/drivers/target/iscsi/iscsi_target_tpg.c +++ b/drivers/target/iscsi/iscsi_target_tpg.c @@ -443,6 +443,9 @@ static bool iscsit_tpg_check_network_portal( break; } spin_unlock(&tpg->tpg_np_lock); + + if (match) + break; } spin_unlock(&tiqn->tiqn_tpg_lock); From a65b32748f4566f986ba2495a8236c141fa42a26 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Sun, 16 Jan 2022 11:06:49 +0800 Subject: [PATCH 0530/1295] scsi: ufs: ufshcd-pltfrm: Check the return value of devm_kstrdup() devm_kstrdup() returns pointer to allocated string on success, NULL on failure. So it is better to check the return value of it. Link: https://lore.kernel.org/r/tencent_4257E15D4A94FF9020DDCC4BB9B21C041408@qq.com Reviewed-by: Bean Huo Signed-off-by: Xiaoke Wang Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pltfrm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/scsi/ufs/ufshcd-pltfrm.c index 8b16bbbcb806..87975d1a21c8 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.c +++ b/drivers/scsi/ufs/ufshcd-pltfrm.c @@ -92,6 +92,11 @@ static int ufshcd_parse_clock_info(struct ufs_hba *hba) clki->min_freq = clkfreq[i]; clki->max_freq = clkfreq[i+1]; clki->name = devm_kstrdup(dev, name, GFP_KERNEL); + if (!clki->name) { + ret = -ENOMEM; + goto out; + } + if (!strcmp(name, "ref_clk")) clki->keep_link_active = true; dev_dbg(dev, "%s: min %u max %u name %s\n", "freq-table-hz", @@ -127,6 +132,8 @@ static int ufshcd_populate_vreg(struct device *dev, const char *name, return -ENOMEM; vreg->name = devm_kstrdup(dev, name, GFP_KERNEL); + if (!vreg->name) + return -ENOMEM; snprintf(prop_name, MAX_PROP_SIZE, "%s-max-microamp", name); if (of_property_read_u32(np, prop_name, &vreg->max_uA)) { From b70a99fd13282d7885f69bf1372e28b7506a1613 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 17 Jan 2022 05:53:09 -0800 Subject: [PATCH 0531/1295] scsi: qedf: Add stag_work to all the vports Call trace seen when creating NPIV ports, only 32 out of 64 show online. stag work was not initialized for vport, hence initialize the stag work. WARNING: CPU: 8 PID: 645 at kernel/workqueue.c:1635 __queue_delayed_work+0x68/0x80 CPU: 8 PID: 645 Comm: kworker/8:1 Kdump: loaded Tainted: G IOE --------- -- 4.18.0-348.el8.x86_64 #1 Hardware name: Dell Inc. PowerEdge MX740c/0177V9, BIOS 2.12.2 07/09/2021 Workqueue: events fc_lport_timeout [libfc] RIP: 0010:__queue_delayed_work+0x68/0x80 Code: 89 b2 88 00 00 00 44 89 82 90 00 00 00 48 01 c8 48 89 42 50 41 81 f8 00 20 00 00 75 1d e9 60 24 07 00 44 89 c7 e9 98 f6 ff ff <0f> 0b eb c5 0f 0b eb a1 0f 0b eb a7 0f 0b eb ac 44 89 c6 e9 40 23 RSP: 0018:ffffae514bc3be40 EFLAGS: 00010006 RAX: ffff8d25d6143750 RBX: 0000000000000202 RCX: 0000000000000002 RDX: ffff8d2e31383748 RSI: ffff8d25c000d600 RDI: ffff8d2e31383788 RBP: ffff8d2e31380de0 R08: 0000000000002000 R09: ffff8d2e31383750 R10: ffffffffc0c957e0 R11: ffff8d2624800000 R12: ffff8d2e31380a58 R13: ffff8d2d915eb000 R14: ffff8d25c499b5c0 R15: ffff8d2e31380e18 FS: 0000000000000000(0000) GS:ffff8d2d1fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fd0484b8b8 CR3: 00000008ffc10006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: queue_delayed_work_on+0x36/0x40 qedf_elsct_send+0x57/0x60 [qedf] fc_lport_enter_flogi+0x90/0xc0 [libfc] fc_lport_timeout+0xb7/0x140 [libfc] process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 ---[ end trace 008f00f722f2c2ff ]-- Initialize stag work for all the vports. Link: https://lore.kernel.org/r/20220117135311.6256-2-njavali@marvell.com Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index cdc66e2a9488..3a5ce540cfc4 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -1864,6 +1864,7 @@ static int qedf_vport_create(struct fc_vport *vport, bool disabled) vport_qedf->cmd_mgr = base_qedf->cmd_mgr; init_completion(&vport_qedf->flogi_compl); INIT_LIST_HEAD(&vport_qedf->fcports); + INIT_DELAYED_WORK(&vport_qedf->stag_work, qedf_stag_change_work); rc = qedf_vport_libfc_config(vport, vn_port); if (rc) { From 5239ab63f17cee643bd4bf6addfedebaa7d4f41e Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 17 Jan 2022 05:53:10 -0800 Subject: [PATCH 0532/1295] scsi: qedf: Fix refcount issue when LOGO is received during TMF Hung task call trace was seen during LOGO processing. [ 974.309060] [0000:00:00.0]:[qedf_eh_device_reset:868]: 1:0:2:0: LUN RESET Issued... [ 974.309065] [0000:00:00.0]:[qedf_initiate_tmf:2422]: tm_flags 0x10 sc_cmd 00000000c16b930f op = 0x2a target_id = 0x2 lun=0 [ 974.309178] [0000:00:00.0]:[qedf_initiate_tmf:2431]: portid=016900 tm_flags =LUN RESET [ 974.309222] [0000:00:00.0]:[qedf_initiate_tmf:2438]: orig io_req = 00000000ec78df8f xid = 0x180 ref_cnt = 1. [ 974.309625] host1: rport 016900: Received LOGO request while in state Ready [ 974.309627] host1: rport 016900: Delete port [ 974.309642] host1: rport 016900: work event 3 [ 974.309644] host1: rport 016900: lld callback ev 3 [ 974.313243] [0000:61:00.2]:[qedf_execute_tmf:2383]:1: fcport is uploading, not executing flush. [ 974.313295] [0000:61:00.2]:[qedf_execute_tmf:2400]:1: task mgmt command success... [ 984.031088] INFO: task jbd2/dm-15-8:7645 blocked for more than 120 seconds. [ 984.031136] Not tainted 4.18.0-305.el8.x86_64 #1 [ 984.031166] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 984.031209] jbd2/dm-15-8 D 0 7645 2 0x80004080 [ 984.031212] Call Trace: [ 984.031222] __schedule+0x2c4/0x700 [ 984.031230] ? unfreeze_partials.isra.83+0x16e/0x1a0 [ 984.031233] ? bit_wait_timeout+0x90/0x90 [ 984.031235] schedule+0x38/0xa0 [ 984.031238] io_schedule+0x12/0x40 [ 984.031240] bit_wait_io+0xd/0x50 [ 984.031243] __wait_on_bit+0x6c/0x80 [ 984.031248] ? free_buffer_head+0x21/0x50 [ 984.031251] out_of_line_wait_on_bit+0x91/0xb0 [ 984.031257] ? init_wait_var_entry+0x50/0x50 [ 984.031268] jbd2_journal_commit_transaction+0x112e/0x19f0 [jbd2] [ 984.031280] kjournald2+0xbd/0x270 [jbd2] [ 984.031284] ? finish_wait+0x80/0x80 [ 984.031291] ? commit_timeout+0x10/0x10 [jbd2] [ 984.031294] kthread+0x116/0x130 [ 984.031300] ? kthread_flush_work_fn+0x10/0x10 [ 984.031305] ret_from_fork+0x1f/0x40 There was a ref count issue when LOGO is received during TMF. This leads to one of the I/Os hanging with the driver. Fix the ref count. Link: https://lore.kernel.org/r/20220117135311.6256-3-njavali@marvell.com Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index 99a56ca1fb16..fab43dabe5b3 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -2250,6 +2250,7 @@ process_els: io_req->tm_flags == FCP_TMF_TGT_RESET) { clear_bit(QEDF_CMD_OUTSTANDING, &io_req->flags); io_req->sc_cmd = NULL; + kref_put(&io_req->refcount, qedf_release_cmd); complete(&io_req->tm_done); } From 64fd4af6274eb0f49d29772c228fffcf6bde1635 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 17 Jan 2022 05:53:11 -0800 Subject: [PATCH 0533/1295] scsi: qedf: Change context reset messages to ratelimited If FCoE is not configured, libfc/libfcoe keeps on retrying FLOGI and after 3 retries driver does a context reset and tries fipvlan again. This leads to context reset message flooding the logs. Hence ratelimit the message to prevent flooding the logs. Link: https://lore.kernel.org/r/20220117135311.6256-4-njavali@marvell.com Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 3a5ce540cfc4..6ad28bc8e948 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -911,7 +911,7 @@ void qedf_ctx_soft_reset(struct fc_lport *lport) struct qed_link_output if_link; if (lport->vport) { - QEDF_ERR(NULL, "Cannot issue host reset on NPIV port.\n"); + printk_ratelimited("Cannot issue host reset on NPIV port.\n"); return; } @@ -3981,7 +3981,9 @@ void qedf_stag_change_work(struct work_struct *work) struct qedf_ctx *qedf = container_of(work, struct qedf_ctx, stag_work.work); - QEDF_ERR(&qedf->dbg_ctx, "Performing software context reset.\n"); + printk_ratelimited("[%s]:[%s:%d]:%d: Performing software context reset.", + dev_name(&qedf->pdev->dev), __func__, __LINE__, + qedf->dbg_ctx.host_no); qedf_ctx_soft_reset(qedf->lport); } From 62afb379a0fee7e9c2f9f68e1abeb85ceddf51b9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 18 Jan 2022 20:15:05 +0800 Subject: [PATCH 0534/1295] scsi: pm8001: Fix bogus FW crash for maxcpus=1 According to the comment in check_fw_ready() we should not check the IOP1_READY field in register SCRATCH_PAD_1 for 8008 or 8009 controllers. However we check this very field in process_oq() for processing the highest index interrupt vector. The highest interrupt vector is checked as the FW is programmed to signal fatal errors through this irq. Change that function to not check IOP1_READY for those mentioned controllers, but do check ILA_READY in both cases. The reason I assume that this was not hit earlier was because we always allocated 64 MSI(X), and just did not pass the vector index check in process_oq(), i.e. the handler never ran for vector index 63. Link: https://lore.kernel.org/r/1642508105-95432-1-git-send-email-john.garry@huawei.com Tested-by: Damien Le Moal Reviewed-by: Damien Le Moal Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 16 ++++++++++++++-- drivers/scsi/pm8001/pm80xx_hwi.h | 6 +++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index bbf538fe15b3..2530d1365556 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -4151,10 +4151,22 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec) u32 ret = MPI_IO_STATUS_FAIL; u32 regval; + /* + * Fatal errors are programmed to be signalled in irq vector + * pm8001_ha->max_q_num - 1 through pm8001_ha->main_cfg_tbl.pm80xx_tbl. + * fatal_err_interrupt + */ if (vec == (pm8001_ha->max_q_num - 1)) { + u32 mipsall_ready; + + if (pm8001_ha->chip_id == chip_8008 || + pm8001_ha->chip_id == chip_8009) + mipsall_ready = SCRATCH_PAD_MIPSALL_READY_8PORT; + else + mipsall_ready = SCRATCH_PAD_MIPSALL_READY_16PORT; + regval = pm8001_cr32(pm8001_ha, 0, MSGU_SCRATCH_PAD_1); - if ((regval & SCRATCH_PAD_MIPSALL_READY) != - SCRATCH_PAD_MIPSALL_READY) { + if ((regval & mipsall_ready) != mipsall_ready) { pm8001_ha->controller_fatal_error = true; pm8001_dbg(pm8001_ha, FAIL, "Firmware Fatal error! Regval:0x%x\n", diff --git a/drivers/scsi/pm8001/pm80xx_hwi.h b/drivers/scsi/pm8001/pm80xx_hwi.h index c7e5d93bea92..c41ed039c92a 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.h +++ b/drivers/scsi/pm8001/pm80xx_hwi.h @@ -1405,8 +1405,12 @@ typedef struct SASProtocolTimerConfig SASProtocolTimerConfig_t; #define SCRATCH_PAD_BOOT_LOAD_SUCCESS 0x0 #define SCRATCH_PAD_IOP0_READY 0xC00 #define SCRATCH_PAD_IOP1_READY 0x3000 -#define SCRATCH_PAD_MIPSALL_READY (SCRATCH_PAD_IOP1_READY | \ +#define SCRATCH_PAD_MIPSALL_READY_16PORT (SCRATCH_PAD_IOP1_READY | \ SCRATCH_PAD_IOP0_READY | \ + SCRATCH_PAD_ILA_READY | \ + SCRATCH_PAD_RAAE_READY) +#define SCRATCH_PAD_MIPSALL_READY_8PORT (SCRATCH_PAD_IOP0_READY | \ + SCRATCH_PAD_ILA_READY | \ SCRATCH_PAD_RAAE_READY) /* boot loader state */ From 8c9db6679be4348b8aae108e11d4be2f83976e30 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Tue, 18 Jan 2022 17:58:03 +0100 Subject: [PATCH 0535/1295] scsi: zfcp: Fix failed recovery on gone remote port with non-NPIV FCP devices Suppose we have an environment with a number of non-NPIV FCP devices (virtual HBAs / FCP devices / zfcp "adapter"s) sharing the same physical FCP channel (HBA port) and its I_T nexus. Plus a number of storage target ports zoned to such shared channel. Now one target port logs out of the fabric causing an RSCN. Zfcp reacts with an ADISC ELS and subsequent port recovery depending on the ADISC result. This happens on all such FCP devices (in different Linux images) concurrently as they all receive a copy of this RSCN. In the following we look at one of those FCP devices. Requests other than FSF_QTCB_FCP_CMND can be slow until they get a response. Depending on which requests are affected by slow responses, there are different recovery outcomes. Here we want to fix failed recoveries on port or adapter level by avoiding recovery requests that can be slow. We need the cached N_Port_ID for the remote port "link" test with ADISC. Just before sending the ADISC, we now intentionally forget the old cached N_Port_ID. The idea is that on receiving an RSCN for a port, we have to assume that any cached information about this port is stale. This forces a fresh new GID_PN [FC-GS] nameserver lookup on any subsequent recovery for the same port. Since we typically can still communicate with the nameserver efficiently, we now reach steady state quicker: Either the nameserver still does not know about the port so we stop recovery, or the nameserver already knows the port potentially with a new N_Port_ID and we can successfully and quickly perform open port recovery. For the one case, where ADISC returns successfully, we re-initialize port->d_id because that case does not involve any port recovery. This also solves a problem if the storage WWPN quickly logs into the fabric again but with a different N_Port_ID. Such as on virtual WWPN takeover during target NPIV failover. [https://www.redbooks.ibm.com/abstracts/redp5477.html] In that case the RSCN from the storage FDISC was ignored by zfcp and we could not successfully recover the failover. On some later failback on the storage, we could have been lucky if the virtual WWPN got the same old N_Port_ID from the SAN switch as we still had cached. Then the related RSCN triggered a successful port reopen recovery. However, there is no guarantee to get the same N_Port_ID on NPIV FDISC. Even though NPIV-enabled FCP devices are not affected by this problem, this code change optimizes recovery time for gone remote ports as a side effect. The timely drop of cached N_Port_IDs prevents unnecessary slow open port attempts. While the problem might have been in code before v2.6.32 commit 799b76d09aee ("[SCSI] zfcp: Decouple gid_pn requests from erp") this fix depends on the gid_pn_work introduced with that commit, so we mark it as culprit to satisfy fix dependencies. Note: Point-to-point remote port is already handled separately and gets its N_Port_ID from the cached peer_d_id. So resetting port->d_id in general does not affect PtP. Link: https://lore.kernel.org/r/20220118165803.3667947-1-maier@linux.ibm.com Fixes: 799b76d09aee ("[SCSI] zfcp: Decouple gid_pn requests from erp") Cc: #2.6.32+ Suggested-by: Benjamin Block Reviewed-by: Benjamin Block Signed-off-by: Steffen Maier Signed-off-by: Martin K. Petersen --- drivers/s390/scsi/zfcp_fc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index d24cafe02708..511bf8e0a436 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -521,6 +521,8 @@ static void zfcp_fc_adisc_handler(void *data) goto out; } + /* re-init to undo drop from zfcp_fc_adisc() */ + port->d_id = ntoh24(adisc_resp->adisc_port_id); /* port is good, unblock rport without going through erp */ zfcp_scsi_schedule_rport_register(port); out: @@ -534,6 +536,7 @@ static int zfcp_fc_adisc(struct zfcp_port *port) struct zfcp_fc_req *fc_req; struct zfcp_adapter *adapter = port->adapter; struct Scsi_Host *shost = adapter->scsi_host; + u32 d_id; int ret; fc_req = kmem_cache_zalloc(zfcp_fc_req_cache, GFP_ATOMIC); @@ -558,7 +561,15 @@ static int zfcp_fc_adisc(struct zfcp_port *port) fc_req->u.adisc.req.adisc_cmd = ELS_ADISC; hton24(fc_req->u.adisc.req.adisc_port_id, fc_host_port_id(shost)); - ret = zfcp_fsf_send_els(adapter, port->d_id, &fc_req->ct_els, + d_id = port->d_id; /* remember as destination for send els below */ + /* + * Force fresh GID_PN lookup on next port recovery. + * Must happen after request setup and before sending request, + * to prevent race with port->d_id re-init in zfcp_fc_adisc_handler(). + */ + port->d_id = 0; + + ret = zfcp_fsf_send_els(adapter, d_id, &fc_req->ct_els, ZFCP_FC_CTELS_TMO); if (ret) kmem_cache_free(zfcp_fc_req_cache, fc_req); From 847f9ea4c5186fdb7b84297e3eeed9e340e83fce Mon Sep 17 00:00:00 2001 From: John Meneghini Date: Fri, 14 Jan 2022 23:00:44 -0500 Subject: [PATCH 0536/1295] scsi: bnx2fc: Flush destroy_work queue before calling bnx2fc_interface_put() The bnx2fc_destroy() functions are removing the interface before calling destroy_work. This results multiple WARNings from sysfs_remove_group() as the controller rport device attributes are removed too early. Replace the fcoe_port's destroy_work queue. It's not needed. The problem is easily reproducible with the following steps. Example: $ dmesg -w & $ systemctl enable --now fcoe $ fipvlan -s -c ens2f1 $ fcoeadm -d ens2f1.802 [ 583.464488] host2: libfc: Link down on port (7500a1) [ 583.472651] bnx2fc: 7500a1 - rport not created Yet!! [ 583.490468] ------------[ cut here ]------------ [ 583.538725] sysfs group 'power' not found for kobject 'rport-2:0-0' [ 583.568814] WARNING: CPU: 3 PID: 192 at fs/sysfs/group.c:279 sysfs_remove_group+0x6f/0x80 [ 583.607130] Modules linked in: dm_service_time 8021q garp mrp stp llc bnx2fc cnic uio rpcsec_gss_krb5 auth_rpcgss nfsv4 ... [ 583.942994] CPU: 3 PID: 192 Comm: kworker/3:2 Kdump: loaded Not tainted 5.14.0-39.el9.x86_64 #1 [ 583.984105] Hardware name: HP ProLiant DL120 G7, BIOS J01 07/01/2013 [ 584.016535] Workqueue: fc_wq_2 fc_rport_final_delete [scsi_transport_fc] [ 584.050691] RIP: 0010:sysfs_remove_group+0x6f/0x80 [ 584.074725] Code: ff 5b 48 89 ef 5d 41 5c e9 ee c0 ff ff 48 89 ef e8 f6 b8 ff ff eb d1 49 8b 14 24 48 8b 33 48 c7 c7 ... [ 584.162586] RSP: 0018:ffffb567c15afdc0 EFLAGS: 00010282 [ 584.188225] RAX: 0000000000000000 RBX: ffffffff8eec4220 RCX: 0000000000000000 [ 584.221053] RDX: ffff8c1586ce84c0 RSI: ffff8c1586cd7cc0 RDI: ffff8c1586cd7cc0 [ 584.255089] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffb567c15afc00 [ 584.287954] R10: ffffb567c15afbf8 R11: ffffffff8fbe7f28 R12: ffff8c1486326400 [ 584.322356] R13: ffff8c1486326480 R14: ffff8c1483a4a000 R15: 0000000000000004 [ 584.355379] FS: 0000000000000000(0000) GS:ffff8c1586cc0000(0000) knlGS:0000000000000000 [ 584.394419] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 584.421123] CR2: 00007fe95a6f7840 CR3: 0000000107674002 CR4: 00000000000606e0 [ 584.454888] Call Trace: [ 584.466108] device_del+0xb2/0x3e0 [ 584.481701] device_unregister+0x13/0x60 [ 584.501306] bsg_unregister_queue+0x5b/0x80 [ 584.522029] bsg_remove_queue+0x1c/0x40 [ 584.541884] fc_rport_final_delete+0xf3/0x1d0 [scsi_transport_fc] [ 584.573823] process_one_work+0x1e3/0x3b0 [ 584.592396] worker_thread+0x50/0x3b0 [ 584.609256] ? rescuer_thread+0x370/0x370 [ 584.628877] kthread+0x149/0x170 [ 584.643673] ? set_kthread_struct+0x40/0x40 [ 584.662909] ret_from_fork+0x22/0x30 [ 584.680002] ---[ end trace 53575ecefa942ece ]--- Link: https://lore.kernel.org/r/20220115040044.1013475-1-jmeneghi@redhat.com Fixes: 0cbf32e1681d ("[SCSI] bnx2fc: Avoid calling bnx2fc_if_destroy with unnecessary locks") Tested-by: Guangwu Zhang Co-developed-by: Maurizio Lombardi Signed-off-by: Maurizio Lombardi Signed-off-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 71fa62bd3083..9be273c320e2 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -82,7 +82,7 @@ static int bnx2fc_bind_pcidev(struct bnx2fc_hba *hba); static void bnx2fc_unbind_pcidev(struct bnx2fc_hba *hba); static struct fc_lport *bnx2fc_if_create(struct bnx2fc_interface *interface, struct device *parent, int npiv); -static void bnx2fc_destroy_work(struct work_struct *work); +static void bnx2fc_port_destroy(struct fcoe_port *port); static struct bnx2fc_hba *bnx2fc_hba_lookup(struct net_device *phys_dev); static struct bnx2fc_interface *bnx2fc_interface_lookup(struct net_device @@ -907,9 +907,6 @@ static void bnx2fc_indicate_netevent(void *context, unsigned long event, __bnx2fc_destroy(interface); } mutex_unlock(&bnx2fc_dev_lock); - - /* Ensure ALL destroy work has been completed before return */ - flush_workqueue(bnx2fc_wq); return; default: @@ -1215,8 +1212,8 @@ static int bnx2fc_vport_destroy(struct fc_vport *vport) mutex_unlock(&n_port->lp_mutex); bnx2fc_free_vport(interface->hba, port->lport); bnx2fc_port_shutdown(port->lport); + bnx2fc_port_destroy(port); bnx2fc_interface_put(interface); - queue_work(bnx2fc_wq, &port->destroy_work); return 0; } @@ -1525,7 +1522,6 @@ static struct fc_lport *bnx2fc_if_create(struct bnx2fc_interface *interface, port->lport = lport; port->priv = interface; port->get_netdev = bnx2fc_netdev; - INIT_WORK(&port->destroy_work, bnx2fc_destroy_work); /* Configure fcoe_port */ rc = bnx2fc_lport_config(lport); @@ -1653,8 +1649,8 @@ static void __bnx2fc_destroy(struct bnx2fc_interface *interface) bnx2fc_interface_cleanup(interface); bnx2fc_stop(interface); list_del(&interface->list); + bnx2fc_port_destroy(port); bnx2fc_interface_put(interface); - queue_work(bnx2fc_wq, &port->destroy_work); } /** @@ -1694,15 +1690,12 @@ netdev_err: return rc; } -static void bnx2fc_destroy_work(struct work_struct *work) +static void bnx2fc_port_destroy(struct fcoe_port *port) { - struct fcoe_port *port; struct fc_lport *lport; - port = container_of(work, struct fcoe_port, destroy_work); lport = port->lport; - - BNX2FC_HBA_DBG(lport, "Entered bnx2fc_destroy_work\n"); + BNX2FC_HBA_DBG(lport, "Entered %s, destroying lport %p\n", __func__, lport); bnx2fc_if_destroy(lport); } @@ -2556,9 +2549,6 @@ static void bnx2fc_ulp_exit(struct cnic_dev *dev) __bnx2fc_destroy(interface); mutex_unlock(&bnx2fc_dev_lock); - /* Ensure ALL destroy work has been completed before return */ - flush_workqueue(bnx2fc_wq); - bnx2fc_ulp_stop(hba); /* unregister cnic device */ if (test_and_clear_bit(BNX2FC_CNIC_REGISTERED, &hba->reg_with_cnic)) From fb8d5ea8fd907faa3751a9e5df5d01b5f3803e35 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 09:53:03 +0100 Subject: [PATCH 0537/1295] scsi: 3w-sas: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32-bit case will also fail for the same reason. Simplify code and remove some dead code accordingly. [1]: https://lore.kernel.org/linux-kernel/YL3vSPK5DXTNvgdx@infradead.org/#t Link: https://lore.kernel.org/r/dbbe8671ca760972d80f8d35f3170b4609bee368.1642236763.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/3w-sas.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index b9482da79512..3ebe66151dcb 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1567,8 +1567,6 @@ static int twl_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) pci_try_set_mwi(pdev); retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (retval) - retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (retval) { TW_PRINTK(host, TW_DRIVER, 0x18, "Failed to set dma mask"); retval = -ENODEV; @@ -1786,8 +1784,6 @@ static int __maybe_unused twl_resume(struct device *dev) pci_try_set_mwi(pdev); retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (retval) - retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (retval) { TW_PRINTK(host, TW_DRIVER, 0x25, "Failed to set dma mask during resume"); retval = -ENODEV; From 8001fa240fc0af1c3538a9fbaccd2c345ff9ab62 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 10:05:22 +0100 Subject: [PATCH 0538/1295] scsi: hisi_sas: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32-bit case will also fail for the same reason. Simplify code and remove some dead code accordingly. [1]: https://lore.kernel.org/linux-kernel/YL3vSPK5DXTNvgdx@infradead.org/#t Link: https://lore.kernel.org/r/1bf2d3660178b0e6f172e5208bc0bd68d31d9268.1642237482.git.christophe.jaillet@wanadoo.fr Acked-by: John Garry Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 3 --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 -- 2 files changed, 5 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index a05ec7aece5a..2f53a2ee024a 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -2666,9 +2666,6 @@ static struct Scsi_Host *hisi_sas_shost_alloc(struct platform_device *pdev, goto err_out; error = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); - if (error) - error = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); - if (error) { dev_err(dev, "No usable DMA addressing method\n"); goto err_out; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index a45ef9a5e12e..a01a3a7b706b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -4695,8 +4695,6 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_out; rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) { dev_err(dev, "No usable DMA addressing method\n"); rc = -ENODEV; From 012d98dae453821ac31da25595ffa26d4ad49c8c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 10:15:41 +0100 Subject: [PATCH 0539/1295] scsi: bfa: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32-bit case will also fail for the same reason. Simplify code and remove some dead code accordingly. [1]: https://lore.kernel.org/linux-kernel/YL3vSPK5DXTNvgdx@infradead.org/#t Link: https://lore.kernel.org/r/5663cef9b54004fa56cca7ce65f51eadfc3ecddb.1642238127.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index 440ef32be048..e5aa982ffedc 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -732,9 +732,6 @@ bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad) pci_set_master(pdev); rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (rc) { rc = -ENODEV; printk(KERN_ERR "dma_set_mask_and_coherent fail %p\n", pdev); @@ -1559,9 +1556,6 @@ bfad_pci_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64)); - if (rc) - rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, - DMA_BIT_MASK(32)); if (rc) goto out_disable_device; From ad6c8a426446873febc98140d81d5353f8c0825b Mon Sep 17 00:00:00 2001 From: Kiwoong Kim Date: Fri, 21 Jan 2022 14:33:02 +0900 Subject: [PATCH 0540/1295] scsi: ufs: Use generic error code in ufshcd_set_dev_pwr_mode() The return value of ufshcd_set_dev_pwr_mode() is passed to device PM core. However, the function currently returns a SCSI result which the PM core doesn't understand. This might lead to unexpected behaviors in userland; a platform reset was observed in Android. Use a generic error code for SSU failures. Link: https://lore.kernel.org/r/1642743182-54098-1-git-send-email-kwmad.kim@samsung.com Reviewed-by: Bart Van Assche Signed-off-by: Kiwoong Kim Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 460d2b440d2e..50b12d60dc1b 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8613,7 +8613,7 @@ static void ufshcd_hba_exit(struct ufs_hba *hba) * @pwr_mode: device power mode to set * * Returns 0 if requested power mode is set successfully - * Returns non-zero if failed to set the requested power mode + * Returns < 0 if failed to set the requested power mode */ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, enum ufs_dev_pwr_mode pwr_mode) @@ -8667,8 +8667,11 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, sdev_printk(KERN_WARNING, sdp, "START_STOP failed for power mode: %d, result %x\n", pwr_mode, ret); - if (ret > 0 && scsi_sense_valid(&sshdr)) - scsi_print_sense_hdr(sdp, NULL, &sshdr); + if (ret > 0) { + if (scsi_sense_valid(&sshdr)) + scsi_print_sense_hdr(sdp, NULL, &sshdr); + ret = -EIO; + } } if (!ret) From c99b9b2301492b665b6e51ba6c06ec362eddcd10 Mon Sep 17 00:00:00 2001 From: Kiwoong Kim Date: Fri, 21 Jan 2022 14:37:55 +0900 Subject: [PATCH 0541/1295] scsi: ufs: Treat link loss as fatal error This event is raised when link is lost as specified in UFSHCI spec and that means communication is not possible. Thus initializing UFS interface needs to be done. Make UFS driver considers Link Lost as fatal in the INT_FATAL_ERRORS mask. This will trigger a host reset whenever a link lost interrupt occurs. Link: https://lore.kernel.org/r/1642743475-54275-1-git-send-email-kwmad.kim@samsung.com Signed-off-by: Kiwoong Kim Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshci.h b/drivers/scsi/ufs/ufshci.h index 6a295c88d850..a7ff0e5b5494 100644 --- a/drivers/scsi/ufs/ufshci.h +++ b/drivers/scsi/ufs/ufshci.h @@ -142,7 +142,8 @@ static inline u32 ufshci_version(u32 major, u32 minor) #define INT_FATAL_ERRORS (DEVICE_FATAL_ERROR |\ CONTROLLER_FATAL_ERROR |\ SYSTEM_BUS_FATAL_ERROR |\ - CRYPTO_ENGINE_FATAL_ERROR) + CRYPTO_ENGINE_FATAL_ERROR |\ + UIC_LINK_LOST) /* HCS - Host Controller Status 30h */ #define DEVICE_PRESENT 0x1 From efd7bb1d75cf6808d67c869a29245c88a990bdea Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 23 Jan 2022 17:55:30 +0000 Subject: [PATCH 0542/1295] scsi: 53c700: Remove redundant assignment to pointer SCp Pointer SCp is being re-assigned the same value that it was initialized to a few lines earlier, the assignment is redundant and can be removed. Link: https://lore.kernel.org/r/20220123175530.110462-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/53c700.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c index 3ad3ebaca8e9..ad4972c0fc53 100644 --- a/drivers/scsi/53c700.c +++ b/drivers/scsi/53c700.c @@ -1507,7 +1507,6 @@ NCR_700_intr(int irq, void *dev_id) struct scsi_cmnd *SCp = hostdata->cmd; handled = 1; - SCp = hostdata->cmd; if(istat & SCSI_INT_PENDING) { udelay(10); From 4db09593af0b0b4d7d4805ebb3273df51d7cc30d Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 23 Jan 2022 14:57:17 -0800 Subject: [PATCH 0543/1295] scsi: myrs: Fix crash in error case In myrs_detect(), cs->disable_intr is NULL when privdata->hw_init() fails with non-zero. In this case, myrs_cleanup(cs) will call a NULL ptr and crash the kernel. [ 1.105606] myrs 0000:00:03.0: Unknown Initialization Error 5A [ 1.105872] myrs 0000:00:03.0: Failed to initialize Controller [ 1.106082] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1.110774] Call Trace: [ 1.110950] myrs_cleanup+0xe4/0x150 [myrs] [ 1.111135] myrs_probe.cold+0x91/0x56a [myrs] [ 1.111302] ? DAC960_GEM_intr_handler+0x1f0/0x1f0 [myrs] [ 1.111500] local_pci_probe+0x48/0x90 Link: https://lore.kernel.org/r/20220123225717.1069538-1-ztong0001@gmail.com Reviewed-by: Hannes Reinecke Signed-off-by: Tong Zhang Signed-off-by: Martin K. Petersen --- drivers/scsi/myrs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/myrs.c b/drivers/scsi/myrs.c index 253ceca54a84..7eb8c39da366 100644 --- a/drivers/scsi/myrs.c +++ b/drivers/scsi/myrs.c @@ -2267,7 +2267,8 @@ static void myrs_cleanup(struct myrs_hba *cs) myrs_unmap(cs); if (cs->mmio_base) { - cs->disable_intr(cs); + if (cs->disable_intr) + cs->disable_intr(cs); iounmap(cs->mmio_base); cs->mmio_base = NULL; } From 5ec1cebd59300ddd26dbaa96c17c508764eef911 Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Mon, 4 Oct 2021 04:59:13 -0700 Subject: [PATCH 0544/1295] drm/atomic: Add the crtc to affected crtc only if uapi.enable = true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of a modeset where a mode gets split across multiple CRTCs in the driver specific implementation (bigjoiner in i915) we wrongly count the affected CRTCs based on the drm_crtc_mask and indicate the stolen CRTC as an affected CRTC in atomic_check_only(). This triggers a warning since affected CRTCs doent match requested CRTC. To fix this in such bigjoiner configurations, we should only increment affected crtcs if that CRTC is enabled in UAPI not if it is just used internally in the driver to split the mode. v3: Add the same uapi crtc_state->enable check in requested crtc calc (Ville) Cc: Ville Syrjälä Cc: Simon Ser Cc: Pekka Paalanen Cc: Daniel Stone Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.11+ Fixes: 919c2299a893 ("drm/i915: Enable bigjoiner") Signed-off-by: Manasi Navare Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20211004115913.23889-1-manasi.d.navare@intel.com --- drivers/gpu/drm/drm_atomic.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index ff1416cd609a..a1e4c7905ebb 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -1310,8 +1310,10 @@ int drm_atomic_check_only(struct drm_atomic_state *state) DRM_DEBUG_ATOMIC("checking %p\n", state); - for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) - requested_crtc |= drm_crtc_mask(crtc); + for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { + if (new_crtc_state->enable) + requested_crtc |= drm_crtc_mask(crtc); + } for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { ret = drm_atomic_plane_check(old_plane_state, new_plane_state); @@ -1360,8 +1362,10 @@ int drm_atomic_check_only(struct drm_atomic_state *state) } } - for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) - affected_crtc |= drm_crtc_mask(crtc); + for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { + if (new_crtc_state->enable) + affected_crtc |= drm_crtc_mask(crtc); + } /* * For commits that allow modesets drivers can add other CRTCs to the From 22f7ff0dea9491e90b6fe808ed40c30bd791e5c2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 22 Jan 2022 20:55:30 +1000 Subject: [PATCH 0545/1295] KVM: PPC: Book3S HV Nested: Fix nested HFSCR being clobbered with multiple vCPUs The L0 is storing HFSCR requested by the L1 for the L2 in struct kvm_nested_guest when the L1 requests a vCPU enter L2. kvm_nested_guest is not a per-vCPU structure. Hilarity ensues. Fix it by moving the nested hfscr into the vCPU structure together with the other per-vCPU nested fields. Fixes: 8b210a880b35 ("KVM: PPC: Book3S HV Nested: Make nested HFSCR state accessible") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220122105530.3477250-1-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 - arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 3 +-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index fe07558173ef..827038a33064 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -39,7 +39,6 @@ struct kvm_nested_guest { pgd_t *shadow_pgtable; /* our page table for this guest */ u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ u64 process_table; /* process table entry for this guest */ - u64 hfscr; /* HFSCR that the L1 requested for this nested guest */ long refcnt; /* number of pointers to this struct */ struct mutex tlb_lock; /* serialize page faults and tlbies */ struct kvm_nested_guest *next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a770443cd6e0..d9bf60bf0816 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -818,6 +818,7 @@ struct kvm_vcpu_arch { /* For support of nested guests */ struct kvm_nested_guest *nested; + u64 nested_hfscr; /* HFSCR that the L1 requested for the nested guest */ u32 nested_vcpu_id; gpa_t nested_io_gpr; #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d1817cd9a691..84c89f08ae9a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1816,7 +1816,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) { - struct kvm_nested_guest *nested = vcpu->arch.nested; int r; int srcu_idx; @@ -1922,7 +1921,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) * it into a HEAI. */ if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || - (nested->hfscr & (1UL << cause))) { + (vcpu->arch.nested_hfscr & (1UL << cause))) { vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; /* diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 8f8daaeeb3b7..9d373f8963ee 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -363,7 +363,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* set L1 state to L2 state */ vcpu->arch.nested = l2; vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; - l2->hfscr = l2_hv.hfscr; + vcpu->arch.nested_hfscr = l2_hv.hfscr; vcpu->arch.regs = l2_regs; /* Guest must always run with ME enabled, HV disabled. */ From 8defc2a5dd8f4c0cb19ecbaca8d3e89ab98524da Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 25 Jan 2022 00:39:28 +1000 Subject: [PATCH 0546/1295] powerpc/64s/interrupt: Fix decrementer storm The decrementer exception can fail to be cleared when the interrupt returns in the case where the decrementer wraps with the next timer still beyond decrementer_max. This results in a decrementer interrupt storm. This is triggerable with small decrementer system with hard and soft watchdogs disabled. Fix this by always programming the decrementer if there was no timer. Fixes: 0faf20a1ad16 ("powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use") Reported-by: Alexey Kardashevskiy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220124143930.3923442-1-npiggin@gmail.com --- arch/powerpc/kernel/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 62361cc7281c..cd0b8b71ecdd 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -649,8 +649,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= decrementer_max) - set_dec_or_work(now); + if (now > decrementer_max) + now = decrementer_max; + set_dec_or_work(now); __this_cpu_inc(irq_stat.timer_irqs_others); } From 817f7c9335ec01e0f5e8caffc4f1dcd5e458a4c0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:51 +0000 Subject: [PATCH 0547/1295] ASoC: ops: Reject out of bounds values in snd_soc_put_volsw() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-2-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 08eaa9ddf191..fbe5d326b0f2 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -316,13 +316,27 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (sign_bit) mask = BIT(sign_bit + 1) - 1; - val = ((ucontrol->value.integer.value[0] + min) & mask); + val = ucontrol->value.integer.value[0]; + if (mc->platform_max && val > mc->platform_max) + return -EINVAL; + if (val > max - min) + return -EINVAL; + if (val < 0) + return -EINVAL; + val = (val + min) & mask; if (invert) val = max - val; val_mask = mask << shift; val = val << shift; if (snd_soc_volsw_is_stereo(mc)) { - val2 = ((ucontrol->value.integer.value[1] + min) & mask); + val2 = ucontrol->value.integer.value[1]; + if (mc->platform_max && val2 > mc->platform_max) + return -EINVAL; + if (val2 > max - min) + return -EINVAL; + if (val2 < 0) + return -EINVAL; + val2 = (val2 + min) & mask; if (invert) val2 = max - val2; if (reg == reg2) { From 4f1e50d6a9cf9c1b8c859d449b5031cacfa8404e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:52 +0000 Subject: [PATCH 0548/1295] ASoC: ops: Reject out of bounds values in snd_soc_put_volsw_sx() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-3-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index fbe5d326b0f2..c31e63b27193 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -423,8 +423,15 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, int err = 0; unsigned int val, val_mask; + val = ucontrol->value.integer.value[0]; + if (mc->platform_max && val > mc->platform_max) + return -EINVAL; + if (val > max - min) + return -EINVAL; + if (val < 0) + return -EINVAL; val_mask = mask << shift; - val = (ucontrol->value.integer.value[0] + min) & mask; + val = (val + min) & mask; val = val << shift; err = snd_soc_component_update_bits(component, reg, val_mask, val); From 4cf28e9ae6e2e11a044be1bcbcfa1b0d8675fe4d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 15:32:53 +0000 Subject: [PATCH 0549/1295] ASoC: ops: Reject out of bounds values in snd_soc_put_xr_sx() We don't currently validate that the values being set are within the range we advertised to userspace as being valid, do so and reject any values that are out of range. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220124153253.3548853-4-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index c31e63b27193..dc0e7c8d31f3 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -879,6 +879,8 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol, long val = ucontrol->value.integer.value[0]; unsigned int i; + if (val < mc->min || val > mc->max) + return -EINVAL; if (invert) val = max - val; val &= mask; From c74ead223deb88bdf18af8c772d7ca5a9b6c3c2b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 23 Jan 2022 23:54:58 +0800 Subject: [PATCH 0550/1295] net: stmmac: reduce unnecessary wakeups from eee sw timer Currently, on EEE capable platforms, if EEE SW timer is used, the SW timer cause 1 wakeup/s even if the TX has successfully entered EEE. Remove this unnecessary wakeup by only calling mod_timer() if we haven't successfully entered EEE. Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6708ca2aa4f7..54071ae73879 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -402,7 +402,7 @@ static void stmmac_lpi_entry_timer_config(struct stmmac_priv *priv, bool en) * Description: this function is to verify and enter in LPI mode in case of * EEE. */ -static void stmmac_enable_eee_mode(struct stmmac_priv *priv) +static int stmmac_enable_eee_mode(struct stmmac_priv *priv) { u32 tx_cnt = priv->plat->tx_queues_to_use; u32 queue; @@ -412,13 +412,14 @@ static void stmmac_enable_eee_mode(struct stmmac_priv *priv) struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; if (tx_q->dirty_tx != tx_q->cur_tx) - return; /* still unfinished work */ + return -EBUSY; /* still unfinished work */ } /* Check and enter in LPI mode */ if (!priv->tx_path_in_lpi_mode) stmmac_set_eee_mode(priv, priv->hw, priv->plat->en_tx_lpi_clockgating); + return 0; } /** @@ -450,8 +451,8 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t) { struct stmmac_priv *priv = from_timer(priv, t, eee_ctrl_timer); - stmmac_enable_eee_mode(priv); - mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); + if (stmmac_enable_eee_mode(priv)) + mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); } /** @@ -2647,8 +2648,8 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) if (priv->eee_enabled && !priv->tx_path_in_lpi_mode && priv->eee_sw_timer_en) { - stmmac_enable_eee_mode(priv); - mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); + if (stmmac_enable_eee_mode(priv)) + mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); } /* We still have pending packets, let's call for a new scheduling */ From 29eb31542787e1019208a2e1047bb7c76c069536 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Mon, 24 Jan 2022 11:29:54 +0800 Subject: [PATCH 0551/1295] yam: fix a memory leak in yam_siocdevprivate() ym needs to be free when ym->cmd != SIOCYAMSMCS. Fixes: 0781168e23a2 ("yam: fix a missing-check bug") Signed-off-by: Hangyu Hua Signed-off-by: David S. Miller --- drivers/net/hamradio/yam.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 6376b8485976..980f2be32f05 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -950,9 +950,7 @@ static int yam_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __ ym = memdup_user(data, sizeof(struct yamdrv_ioctl_mcs)); if (IS_ERR(ym)) return PTR_ERR(ym); - if (ym->cmd != SIOCYAMSMCS) - return -EINVAL; - if (ym->bitrate > YAM_MAXBITRATE) { + if (ym->cmd != SIOCYAMSMCS || ym->bitrate > YAM_MAXBITRATE) { kfree(ym); return -EINVAL; } From c63003e3d99761afb280add3b30de1cf30fa522b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 24 Jan 2022 15:35:29 +0100 Subject: [PATCH 0552/1295] net: cpsw: Properly initialise struct page_pool_params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpsw driver didn't properly initialise the struct page_pool_params before calling page_pool_create(), which leads to crashes after the struct has been expanded with new parameters. The second Fixes tag below is where the buggy code was introduced, but because the code was moved around this patch will only apply on top of the commit in the first Fixes tag. Fixes: c5013ac1dd0e ("net: ethernet: ti: cpsw: move set of common functions in cpsw_priv") Fixes: 9ed4050c0d75 ("net: ethernet: ti: cpsw: add XDP support") Reported-by: Colin Foster Signed-off-by: Toke Høiland-Jørgensen Tested-by: Colin Foster Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index ba220593e6db..8f6817f346ba 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1146,7 +1146,7 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv) static struct page_pool *cpsw_create_page_pool(struct cpsw_common *cpsw, int size) { - struct page_pool_params pp_params; + struct page_pool_params pp_params = {}; struct page_pool *pool; pp_params.order = 0; From 74afa30630976861a1308c5ec93391f8b037a2ae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Jan 2022 09:22:49 -0800 Subject: [PATCH 0553/1295] net: fec_mpc52xx: don't discard const from netdev->dev_addr Recent changes made netdev->dev_addr const, and it's passed directly to mpc52xx_fec_set_paddr(). Similar problem exists on the probe patch, the driver needs to call eth_hw_addr_set(). Reported-by: Geert Uytterhoeven Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_mpc52xx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index bbbde9f701c2..be0bd4b44926 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -99,13 +99,13 @@ static void mpc52xx_fec_tx_timeout(struct net_device *dev, unsigned int txqueue) netif_wake_queue(dev); } -static void mpc52xx_fec_set_paddr(struct net_device *dev, u8 *mac) +static void mpc52xx_fec_set_paddr(struct net_device *dev, const u8 *mac) { struct mpc52xx_fec_priv *priv = netdev_priv(dev); struct mpc52xx_fec __iomem *fec = priv->fec; - out_be32(&fec->paddr1, *(u32 *)(&mac[0])); - out_be32(&fec->paddr2, (*(u16 *)(&mac[4]) << 16) | FEC_PADDR2_TYPE); + out_be32(&fec->paddr1, *(const u32 *)(&mac[0])); + out_be32(&fec->paddr2, (*(const u16 *)(&mac[4]) << 16) | FEC_PADDR2_TYPE); } static int mpc52xx_fec_set_mac_address(struct net_device *dev, void *addr) @@ -893,13 +893,15 @@ static int mpc52xx_fec_probe(struct platform_device *op) rv = of_get_ethdev_address(np, ndev); if (rv) { struct mpc52xx_fec __iomem *fec = priv->fec; + u8 addr[ETH_ALEN] __aligned(4); /* * If the MAC addresse is not provided via DT then read * it back from the controller regs */ - *(u32 *)(&ndev->dev_addr[0]) = in_be32(&fec->paddr1); - *(u16 *)(&ndev->dev_addr[4]) = in_be32(&fec->paddr2) >> 16; + *(u32 *)(&addr[0]) = in_be32(&fec->paddr1); + *(u16 *)(&addr[4]) = in_be32(&fec->paddr2) >> 16; + eth_hw_addr_set(ndev, addr); } /* From 2f61353cd2f789a4229b6f5c1c24a40a613357bb Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 25 Jan 2022 15:03:12 +0800 Subject: [PATCH 0554/1295] net: hns3: handle empty unknown interrupt for VF Since some interrupt states may be cleared by hardware, the driver may receive an empty interrupt. Currently, the VF driver directly disables the vector0 interrupt in this case. As a result, the VF is unavailable. Therefore, the vector0 interrupt should be enabled in this case. Fixes: b90fcc5bd904 ("net: hns3: add reset handling for VF when doing Core/Global/IMP reset") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 7df87610ad96..21442a9bb996 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2043,8 +2043,7 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) break; } - if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) - hclgevf_enable_vector(&hdev->misc_vector, true); + hclgevf_enable_vector(&hdev->misc_vector, true); return IRQ_HANDLED; } From 5c89be1dd5cfb697614bc13626ba3bd0781aa160 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 24 Jan 2022 11:36:05 +0100 Subject: [PATCH 0555/1295] KVM: x86: Move CPUID.(EAX=0x12,ECX=1) mangling to __kvm_update_cpuid_runtime() Full equality check of CPUID data on update (kvm_cpuid_check_equal()) may fail for SGX enabled CPUs as CPUID.(EAX=0x12,ECX=1) is currently being mangled in kvm_vcpu_after_set_cpuid(). Move it to __kvm_update_cpuid_runtime() and split off cpuid_get_supported_xcr0() helper as 'vcpu->arch.guest_supported_xcr0' update needs (logically) to stay in kvm_vcpu_after_set_cpuid(). Cc: stable@vger.kernel.org Fixes: feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") Signed-off-by: Vitaly Kuznetsov Message-Id: <20220124103606.2630588-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 54 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3902c28fb6cb..89d7822a8f5b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -196,10 +196,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } +/* + * Calculate guest's supported XCR0 taking into account guest CPUID data and + * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + */ +static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +{ + struct kvm_cpuid_entry2 *best; + + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; +} + static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; + u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { @@ -238,6 +254,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } + + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = cpuid_entry2_find(entries, nent, 0x12, 0x1); + if (best) { + best->ecx &= guest_supported_xcr0 & 0xffffffff; + best->edx &= guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } } void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) @@ -261,27 +292,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) - vcpu->arch.guest_supported_xcr0 = 0; - else - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - - /* - * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate - * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's - * requested XCR0 value. The enclave's XFRM must be a subset of XCRO - * at the time of EENTER, thus adjust the allowed XFRM by the guest's - * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to - * '1' even on CPUs that don't support XSAVE. - */ - best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); - if (best) { - best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; - best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; - best->ecx |= XFEATURE_MASK_FPSSE; - } + vcpu->arch.guest_supported_xcr0 = + cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); kvm_update_pv_runtime(vcpu); From b9bed78e2fa9571b7c983b20666efa0009030c71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 00:06:24 +0000 Subject: [PATCH 0556/1295] KVM: VMX: Set vmcs.PENDING_DBG.BS on #DB in STI/MOVSS blocking shadow Set vmcs.GUEST_PENDING_DBG_EXCEPTIONS.BS, a.k.a. the pending single-step breakpoint flag, when re-injecting a #DB with RFLAGS.TF=1, and STI or MOVSS blocking is active. Setting the flag is necessary to make VM-Entry consistency checks happy, as VMX has an invariant that if RFLAGS.TF is set and STI/MOVSS blocking is true, then the previous instruction must have been STI or MOV/POP, and therefore a single-step #DB must be pending since the RFLAGS.TF cannot have been set by the previous instruction, i.e. the one instruction delay after setting RFLAGS.TF must have already expired. Normally, the CPU sets vmcs.GUEST_PENDING_DBG_EXCEPTIONS.BS appropriately when recording guest state as part of a VM-Exit, but #DB VM-Exits intentionally do not treat the #DB as "guest state" as interception of the #DB effectively makes the #DB host-owned, thus KVM needs to manually set PENDING_DBG.BS when forwarding/re-injecting the #DB to the guest. Note, although this bug can be triggered by guest userspace, doing so requires IOPL=3, and guest userspace running with IOPL=3 has full access to all I/O ports (from the guest's perspective) and can crash/reboot the guest any number of ways. IOPL=3 is required because STI blocking kicks in if and only if RFLAGS.IF is toggled 0=>1, and if CPL>IOPL, STI either takes a #GP or modifies RFLAGS.VIF, not RFLAGS.IF. MOVSS blocking can be initiated by userspace, but can be coincident with a #DB if and only if DR7.GD=1 (General Detect enabled) and a MOV DR is executed in the MOVSS shadow. MOV DR #GPs at CPL>0, thus MOVSS blocking is problematic only for CPL0 (and only if the guest is crazy enough to access a DR in a MOVSS shadow). All other sources of #DBs are either suppressed by MOVSS blocking (single-step, code fetch, data, and I/O), are mutually exclusive with MOVSS blocking (T-bit task switch), or are already handled by KVM (ICEBP, a.k.a. INT1). This bug was originally found by running tests[1] created for XSA-308[2]. Note that Xen's userspace test emits ICEBP in the MOVSS shadow, which is presumably why the Xen bug was deemed to be an exploitable DOS from guest userspace. KVM already handles ICEBP by skipping the ICEBP instruction and thus clears MOVSS blocking as a side effect of its "emulation". [1] http://xenbits.xenproject.org/docs/xtf/xsa-308_2main_8c_source.html [2] https://xenbits.xen.org/xsa/advisory-308.html Reported-by: David Woodhouse Reported-by: Alexander Graf Signed-off-by: Sean Christopherson Message-Id: <20220120000624.655815-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 705e5c082738..2833b095a804 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4905,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + /* + * If the #DB was due to ICEBP, a.k.a. INT1, skip the + * instruction. ICEBP generates a trap-like #DB, but + * despite its interception control being tied to #DB, + * is an instruction intercept, i.e. the VM-Exit occurs + * on the ICEBP itself. Note, skipping ICEBP also + * clears STI and MOVSS blocking. + * + * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS + * if single-step is enabled in RFLAGS and STI or MOVSS + * blocking is active, as the CPU doesn't set the bit + * on VM-Exit due to #DB interception. VM-Entry has a + * consistency check that a single-step #DB is pending + * in this scenario as the previous instruction cannot + * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV + * don't modify RFLAGS), therefore the one instruction + * delay when activating single-step breakpoints must + * have already expired. Note, the CPU sets/clears BS + * as appropriate for all other VM-Exits types. + */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); + else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; From de1956f48543e90f94b1194395f33140898b39b2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 20 Jan 2022 00:38:26 +0000 Subject: [PATCH 0557/1295] KVM: selftests: Re-enable access_tracking_perf_test This selftest was accidentally removed by commit 6a58150859fd ("selftest: KVM: Add intra host migration tests"). Add it back. Fixes: 6a58150859fd ("selftest: KVM: Add intra host migration tests") Signed-off-by: David Matlack Message-Id: <20220120003826.2805036-1-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 81ebf99d6ff0..0e4926bc9a58 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test From d081a343dd18f6733f2f4d9a2521db92de9f7b75 Mon Sep 17 00:00:00 2001 From: Quanfa Fu Date: Sun, 19 Dec 2021 17:14:46 +0800 Subject: [PATCH 0558/1295] KVM/X86: Make kvm_vcpu_reload_apic_access_page() static Make kvm_vcpu_reload_apic_access_page() static as it is no longer invoked directly by vmx and it is also no longer exported. No functional change intended. Signed-off-by: Quanfa Fu Message-Id: <20211219091446.174584-1-quanfafu@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 682ad02a4e58..9b2fffeeab16 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1856,7 +1856,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 55518b7d3b96..9ceeab1e5bdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9753,7 +9753,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); } -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) return; From 0cea730cac824edf78ffd3302938ed5fe2b9d50d Mon Sep 17 00:00:00 2001 From: Padmanabha Srinivasaiah Date: Fri, 31 Dec 2021 20:54:03 +0100 Subject: [PATCH 0559/1295] staging: vc04_services: Fix RCU dereference check In service_callback path RCU dereferenced pointer struct vchiq_service need to be accessed inside rcu read-critical section. Also userdata/user_service part of vchiq_service is accessed around different synchronization mechanism, getting an extra reference to a pointer keeps sematics simpler and avoids prolonged graceperiod. Accessing vchiq_service with rcu_read_[lock/unlock] fixes below issue. [ 32.201659] ============================= [ 32.201664] WARNING: suspicious RCU usage [ 32.201670] 5.15.11-rt24-v8+ #3 Not tainted [ 32.201680] ----------------------------- [ 32.201685] drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h:529 suspicious rcu_dereference_check() usage! [ 32.201695] [ 32.201695] other info that might help us debug this: [ 32.201695] [ 32.201700] [ 32.201700] rcu_scheduler_active = 2, debug_locks = 1 [ 32.201708] no locks held by vchiq-slot/0/98. [ 32.201715] [ 32.201715] stack backtrace: [ 32.201723] CPU: 1 PID: 98 Comm: vchiq-slot/0 Not tainted 5.15.11-rt24-v8+ #3 [ 32.201733] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT) [ 32.201739] Call trace: [ 32.201742] dump_backtrace+0x0/0x1b8 [ 32.201772] show_stack+0x20/0x30 [ 32.201784] dump_stack_lvl+0x8c/0xb8 [ 32.201799] dump_stack+0x18/0x34 [ 32.201808] lockdep_rcu_suspicious+0xe4/0xf8 [ 32.201817] service_callback+0x124/0x400 [ 32.201830] slot_handler_func+0xf60/0x1e20 [ 32.201839] kthread+0x19c/0x1a8 [ 32.201849] ret_from_fork+0x10/0x20 Tested-by: Stefan Wahren Signed-off-by: Padmanabha Srinivasaiah Link: https://lore.kernel.org/r/20211231195406.5479-1-treasure4paddy@gmail.com Signed-off-by: Greg Kroah-Hartman --- .../interface/vchiq_arm/vchiq_arm.c | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index 6759a6261500..3a2e4582db8e 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -1058,15 +1058,27 @@ service_callback(enum vchiq_reason reason, struct vchiq_header *header, DEBUG_TRACE(SERVICE_CALLBACK_LINE); + rcu_read_lock(); service = handle_to_service(handle); - if (WARN_ON(!service)) + if (WARN_ON(!service)) { + rcu_read_unlock(); return VCHIQ_SUCCESS; + } user_service = (struct user_service *)service->base.userdata; instance = user_service->instance; - if (!instance || instance->closing) + if (!instance || instance->closing) { + rcu_read_unlock(); return VCHIQ_SUCCESS; + } + + /* + * As hopping around different synchronization mechanism, + * taking an extra reference results in simpler implementation. + */ + vchiq_service_get(service); + rcu_read_unlock(); vchiq_log_trace(vchiq_arm_log_level, "%s - service %lx(%d,%p), reason %d, header %lx, instance %lx, bulk_userdata %lx", @@ -1097,6 +1109,7 @@ service_callback(enum vchiq_reason reason, struct vchiq_header *header, bulk_userdata); if (status != VCHIQ_SUCCESS) { DEBUG_TRACE(SERVICE_CALLBACK_LINE); + vchiq_service_put(service); return status; } } @@ -1105,10 +1118,12 @@ service_callback(enum vchiq_reason reason, struct vchiq_header *header, if (wait_for_completion_interruptible(&user_service->remove_event)) { vchiq_log_info(vchiq_arm_log_level, "%s interrupted", __func__); DEBUG_TRACE(SERVICE_CALLBACK_LINE); + vchiq_service_put(service); return VCHIQ_RETRY; } else if (instance->closing) { vchiq_log_info(vchiq_arm_log_level, "%s closing", __func__); DEBUG_TRACE(SERVICE_CALLBACK_LINE); + vchiq_service_put(service); return VCHIQ_ERROR; } DEBUG_TRACE(SERVICE_CALLBACK_LINE); @@ -1137,6 +1152,7 @@ service_callback(enum vchiq_reason reason, struct vchiq_header *header, header = NULL; } DEBUG_TRACE(SERVICE_CALLBACK_LINE); + vchiq_service_put(service); if (skip_completion) return VCHIQ_SUCCESS; From 426aca16e903b387a0b0001d62207a745c67cfd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 18 Jan 2022 19:13:37 +0100 Subject: [PATCH 0560/1295] staging: fbtft: Fix error path in fbtft_driver_module_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If registering the platform driver fails, the function must not return without undoing the spi driver registration first. Fixes: c296d5f9957c ("staging: fbtft: core support") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220118181338.207943-1-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/staging/fbtft/fbtft.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/staging/fbtft/fbtft.h b/drivers/staging/fbtft/fbtft.h index 4cdec34e23d2..55677efc0138 100644 --- a/drivers/staging/fbtft/fbtft.h +++ b/drivers/staging/fbtft/fbtft.h @@ -334,7 +334,10 @@ static int __init fbtft_driver_module_init(void) \ ret = spi_register_driver(&fbtft_driver_spi_driver); \ if (ret < 0) \ return ret; \ - return platform_driver_register(&fbtft_driver_platform_driver); \ + ret = platform_driver_register(&fbtft_driver_platform_driver); \ + if (ret < 0) \ + spi_unregister_driver(&fbtft_driver_spi_driver); \ + return ret; \ } \ \ static void __exit fbtft_driver_module_exit(void) \ From 167a668ab0edf92bfd043bafd24e7f895d074173 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 8 Jan 2022 10:09:10 -0800 Subject: [PATCH 0561/1295] drm/msm/gpu: Wait for idle before suspending System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle. Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail. v2: s/ret/remaining [sboyd], and switch to using active_submits count to ensure we aren't racing with submit cleanup (and devfreq idle work getting scheduled, etc) v3: fix inverted logic Signed-off-by: Rob Clark Reviewed-by: Bjorn Andersson Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220108180913.814448-2-robdclark@gmail.com Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_device.c | 18 ++++++++++++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ 3 files changed, 24 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 93005839b5da..fb261930ad1c 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -608,9 +608,27 @@ static int adreno_resume(struct device *dev) return gpu->funcs->pm_resume(gpu); } +static int active_submits(struct msm_gpu *gpu) +{ + int active_submits; + mutex_lock(&gpu->active_lock); + active_submits = gpu->active_submits; + mutex_unlock(&gpu->active_lock); + return active_submits; +} + static int adreno_suspend(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev); + int remaining; + + remaining = wait_event_timeout(gpu->retire_event, + active_submits(gpu) == 0, + msecs_to_jiffies(1000)); + if (remaining == 0) { + dev_err(dev, "Timeout waiting for GPU to suspend\n"); + return -EBUSY; + } return gpu->funcs->pm_suspend(gpu); } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0f78c2615272..2c1049c0ea14 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -703,6 +703,8 @@ static void retire_submits(struct msm_gpu *gpu) } } } + + wake_up_all(&gpu->retire_event); } static void retire_worker(struct kthread_work *work) @@ -848,6 +850,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, INIT_LIST_HEAD(&gpu->active_list); mutex_init(&gpu->active_lock); mutex_init(&gpu->lock); + init_waitqueue_head(&gpu->retire_event); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); kthread_init_work(&gpu->fault_work, fault_worker); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 445c6bfd4b6b..92aa1e9196c6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -230,6 +230,9 @@ struct msm_gpu { /* work for handling GPU recovery: */ struct kthread_work recover_work; + /** retire_event: notified when submits are retired: */ + wait_queue_head_t retire_event; + /* work for handling active-list retiring: */ struct kthread_work retire_work; From 6aa89ae1fb049614b7e03e24485bbfb96754a02b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 8 Jan 2022 10:09:11 -0800 Subject: [PATCH 0562/1295] drm/msm/gpu: Cancel idle/boost work on suspend With system suspend using pm_runtime_force_suspend() we can't rely on the pm_runtime_get_if_in_use() trick to deal with devfreq callbacks after (or racing with) suspend. So flush any pending idle or boost work in the suspend path. Signed-off-by: Rob Clark Link: https://lore.kernel.org/r/20220108180913.814448-3-robdclark@gmail.com Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 62405e980925..9bf319be11f6 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -133,6 +133,18 @@ void msm_devfreq_init(struct msm_gpu *gpu) CLOCK_MONOTONIC, HRTIMER_MODE_REL); } +static void cancel_idle_work(struct msm_gpu_devfreq *df) +{ + hrtimer_cancel(&df->idle_work.timer); + kthread_cancel_work_sync(&df->idle_work.work); +} + +static void cancel_boost_work(struct msm_gpu_devfreq *df) +{ + hrtimer_cancel(&df->boost_work.timer); + kthread_cancel_work_sync(&df->boost_work.work); +} + void msm_devfreq_cleanup(struct msm_gpu *gpu) { struct msm_gpu_devfreq *df = &gpu->devfreq; @@ -152,7 +164,12 @@ void msm_devfreq_resume(struct msm_gpu *gpu) void msm_devfreq_suspend(struct msm_gpu *gpu) { - devfreq_suspend_device(gpu->devfreq.devfreq); + struct msm_gpu_devfreq *df = &gpu->devfreq; + + devfreq_suspend_device(df->devfreq); + + cancel_idle_work(df); + cancel_boost_work(df); } static void msm_devfreq_boost_work(struct kthread_work *work) @@ -196,7 +213,7 @@ void msm_devfreq_active(struct msm_gpu *gpu) /* * Cancel any pending transition to idle frequency: */ - hrtimer_cancel(&df->idle_work.timer); + cancel_idle_work(df); idle_time = ktime_to_ms(ktime_sub(ktime_get(), df->idle_time)); From 8bdd24940b69c0018b64b496aa3b03a25f7295ca Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 25 Jan 2022 15:40:06 +0100 Subject: [PATCH 0563/1295] amd: declance: use eth_hw_addr_set() Copy scattered mac address octets into an array then eth_hw_addr_set(). Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Thomas Bogendoerfer Link: https://lore.kernel.org/r/20220125144007.64407-1-tsbogend@alpha.franken.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/declance.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 493b0cefcc2a..ec8df05e7bf6 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1032,6 +1032,7 @@ static int dec_lance_probe(struct device *bdev, const int type) int i, ret; unsigned long esar_base; unsigned char *esar; + u8 addr[ETH_ALEN]; const char *desc; if (dec_lance_debug && version_printed++ == 0) @@ -1228,7 +1229,8 @@ static int dec_lance_probe(struct device *bdev, const int type) break; } for (i = 0; i < 6; i++) - dev->dev_addr[i] = esar[i * 4]; + addr[i] = esar[i * 4]; + eth_hw_addr_set(dev, addr); printk("%s: %s, addr = %pM, irq = %d\n", name, desc, dev->dev_addr, dev->irq); From c9d967b2ce40d71e968eb839f36c936b8a9cf1ea Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 13 Jan 2022 19:44:20 +0100 Subject: [PATCH 0564/1295] PM: wakeup: simplify the output logic of pm_show_wakelocks() The buffer handling in pm_show_wakelocks() is tricky, and hopefully correct. Ensure it really is correct by using sysfs_emit_at() which handles all of the tricky string handling logic in a PAGE_SIZE buffer for us automatically as this is a sysfs file being read from. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Lee Jones Signed-off-by: Rafael J. Wysocki --- kernel/power/wakelock.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 105df4dfc783..52571dcad768 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws->active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 From 945c37ed564770c78dfe6b9f08bed57a1b4e60ef Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 10 Jan 2022 20:43:28 +0800 Subject: [PATCH 0565/1295] usb: roles: fix include/linux/usb/role.h compile issue when CONFIG_USB_ROLE_SWITCH is not defined, add usb_role_switch_find_by_fwnode() definition which return NULL. Fixes: c6919d5e0cd1 ("usb: roles: Add usb_role_switch_find_by_fwnode()") Signed-off-by: Linyu Yuan Link: https://lore.kernel.org/r/1641818608-25039-1-git-send-email-quic_linyyuan@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/role.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/usb/role.h b/include/linux/usb/role.h index 031f148ab373..b5deafd91f67 100644 --- a/include/linux/usb/role.h +++ b/include/linux/usb/role.h @@ -91,6 +91,12 @@ fwnode_usb_role_switch_get(struct fwnode_handle *node) static inline void usb_role_switch_put(struct usb_role_switch *sw) { } +static inline struct usb_role_switch * +usb_role_switch_find_by_fwnode(const struct fwnode_handle *fwnode) +{ + return NULL; +} + static inline struct usb_role_switch * usb_role_switch_register(struct device *parent, const struct usb_role_switch_desc *desc) From 33569ef3c754a82010f266b7b938a66a3ccf90a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Wed, 19 Jan 2022 11:47:51 +0100 Subject: [PATCH 0566/1295] PM: hibernate: Remove register_nosave_region_late() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is an unused wrapper forcing kmalloc allocation for registering nosave regions. Also, rename __register_nosave_region() to register_nosave_region() now that there is no need for disambiguation. Signed-off-by: Amadeusz Sławiński Reviewed-by: Cezary Rojewski Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 11 +---------- kernel/power/snapshot.c | 21 +++++++-------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5785d909c321..3e8ecdebe601 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -430,15 +430,7 @@ struct platform_hibernation_ops { #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ -extern void __register_nosave_region(unsigned long b, unsigned long e, int km); -static inline void __init register_nosave_region(unsigned long b, unsigned long e) -{ - __register_nosave_region(b, e, 0); -} -static inline void __init register_nosave_region_late(unsigned long b, unsigned long e) -{ - __register_nosave_region(b, e, 1); -} +extern void register_nosave_region(unsigned long b, unsigned long e); extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); @@ -458,7 +450,6 @@ int pfn_is_nosave(unsigned long pfn); int hibernate_quiet_exec(int (*func)(void *data), void *data); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} -static inline void register_nosave_region_late(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f7a986078213..330d49937692 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -978,8 +978,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * Register a range of page frames the contents of which should not be saved * during hibernation (to be used in the early initialization code). */ -void __init __register_nosave_region(unsigned long start_pfn, - unsigned long end_pfn, int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -995,18 +994,12 @@ void __init __register_nosave_region(unsigned long start_pfn, goto Report; } } - if (use_kmalloc) { - /* During init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else { - /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), - SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); - } + /* This allocation cannot fail */ + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); From 5638b0dfb6921f69943c705383ff40fb64b987f2 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 13 Jan 2022 17:29:43 +0800 Subject: [PATCH 0567/1295] usb: typec: tcpci: don't touch CC line if it's Vconn source With the AMS and Collision Avoidance, tcpm often needs to change the CC's termination. When one CC line is sourcing Vconn, if we still change its termination, the voltage of the another CC line is likely to be fluctuant and unstable. Therefore, we should verify whether a CC line is sourcing Vconn before changing its termination and only change the termination that is not a Vconn line. This can be done by reading the Vconn Present bit of POWER_ STATUS register. To determine the polarity, we can read the Plug Orientation bit of TCPC_CONTROL register. Since Vconn can only be sourced if Plug Orientation is set. Fixes: 0908c5aca31e ("usb: typec: tcpm: AMS and Collision Avoidance") cc: Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20220113092943.752372-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpci.c | 26 ++++++++++++++++++++++++++ drivers/usb/typec/tcpm/tcpci.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index 35a1307349a2..e07d26a3cd8e 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -75,9 +75,25 @@ static int tcpci_write16(struct tcpci *tcpci, unsigned int reg, u16 val) static int tcpci_set_cc(struct tcpc_dev *tcpc, enum typec_cc_status cc) { struct tcpci *tcpci = tcpc_to_tcpci(tcpc); + bool vconn_pres; + enum typec_cc_polarity polarity = TYPEC_POLARITY_CC1; unsigned int reg; int ret; + ret = regmap_read(tcpci->regmap, TCPC_POWER_STATUS, ®); + if (ret < 0) + return ret; + + vconn_pres = !!(reg & TCPC_POWER_STATUS_VCONN_PRES); + if (vconn_pres) { + ret = regmap_read(tcpci->regmap, TCPC_TCPC_CTRL, ®); + if (ret < 0) + return ret; + + if (reg & TCPC_TCPC_CTRL_ORIENTATION) + polarity = TYPEC_POLARITY_CC2; + } + switch (cc) { case TYPEC_CC_RA: reg = (TCPC_ROLE_CTRL_CC_RA << TCPC_ROLE_CTRL_CC1_SHIFT) | @@ -112,6 +128,16 @@ static int tcpci_set_cc(struct tcpc_dev *tcpc, enum typec_cc_status cc) break; } + if (vconn_pres) { + if (polarity == TYPEC_POLARITY_CC2) { + reg &= ~(TCPC_ROLE_CTRL_CC1_MASK << TCPC_ROLE_CTRL_CC1_SHIFT); + reg |= (TCPC_ROLE_CTRL_CC_OPEN << TCPC_ROLE_CTRL_CC1_SHIFT); + } else { + reg &= ~(TCPC_ROLE_CTRL_CC2_MASK << TCPC_ROLE_CTRL_CC2_SHIFT); + reg |= (TCPC_ROLE_CTRL_CC_OPEN << TCPC_ROLE_CTRL_CC2_SHIFT); + } + } + ret = regmap_write(tcpci->regmap, TCPC_ROLE_CTRL, reg); if (ret < 0) return ret; diff --git a/drivers/usb/typec/tcpm/tcpci.h b/drivers/usb/typec/tcpm/tcpci.h index 2be7a77d400e..b2edd45f13c6 100644 --- a/drivers/usb/typec/tcpm/tcpci.h +++ b/drivers/usb/typec/tcpm/tcpci.h @@ -98,6 +98,7 @@ #define TCPC_POWER_STATUS_SOURCING_VBUS BIT(4) #define TCPC_POWER_STATUS_VBUS_DET BIT(3) #define TCPC_POWER_STATUS_VBUS_PRES BIT(2) +#define TCPC_POWER_STATUS_VCONN_PRES BIT(1) #define TCPC_POWER_STATUS_SINKING_VBUS BIT(0) #define TCPC_FAULT_STATUS 0x1f From 7817adb03cfb52ebb5bdb25fd9fc8f683a1a09d9 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 24 Jan 2022 12:02:27 +0300 Subject: [PATCH 0568/1295] usb: typec: Only attempt to link USB ports if there is fwnode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that creates the links to the USB ports attached to a connector inside the system assumed that the ACPI nodes (fwnodes) always exist for the connectors, but it can not do that. There is no guarantee that every USB Type-C connector has ACPI device node representing it in the ACPI tables, and even if there are the nodes in the ACPI tables, the _STA method in those nodes may still return 0 (which means the device does not exist from ACPI PoW). This fixes NULL pointer dereference that happens if the nodes are missing. Fixes: 730b49aac426 ("usb: typec: port-mapper: Convert to the component framework") Reported-and-tested-by: Robert Święcki Reported-by: Mikhail Gavrilov Tested-by: Marc Zyngier Acked-by: Marc Zyngier Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20220124090228.41396-2-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/port-mapper.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/port-mapper.c b/drivers/usb/typec/port-mapper.c index 07d307418b47..b6e0c6acc628 100644 --- a/drivers/usb/typec/port-mapper.c +++ b/drivers/usb/typec/port-mapper.c @@ -56,6 +56,9 @@ int typec_link_ports(struct typec_port *con) { struct each_port_arg arg = { .port = con, .match = NULL }; + if (!has_acpi_companion(&con->dev)) + return 0; + bus_for_each_dev(&acpi_bus_type, NULL, &arg, typec_port_match); /* @@ -74,5 +77,6 @@ int typec_link_ports(struct typec_port *con) void typec_unlink_ports(struct typec_port *con) { - component_master_del(&con->dev, &typec_aggregate_ops); + if (has_acpi_companion(&con->dev)) + component_master_del(&con->dev, &typec_aggregate_ops); } From 147ab5376f18045da9f22a8262185707745bbf77 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 24 Jan 2022 12:02:28 +0300 Subject: [PATCH 0569/1295] usb: typec: Don't try to register component master without components This fixes NULL pointer dereference that happens if component master is registered with empty component match list. Fixes: 730b49aac426 ("usb: typec: port-mapper: Convert to the component framework") Reported-by: Mikhail Gavrilov Tested-by: John Stultz Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20220124090228.41396-3-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/port-mapper.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/typec/port-mapper.c b/drivers/usb/typec/port-mapper.c index b6e0c6acc628..a7d507802509 100644 --- a/drivers/usb/typec/port-mapper.c +++ b/drivers/usb/typec/port-mapper.c @@ -60,6 +60,8 @@ int typec_link_ports(struct typec_port *con) return 0; bus_for_each_dev(&acpi_bus_type, NULL, &arg, typec_port_match); + if (!arg.match) + return 0; /* * REVISIT: Now each connector can have only a single component master. From e464121f2d40eabc7d11823fb26db807ce945df4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Jan 2022 09:47:38 -0800 Subject: [PATCH 0570/1295] x86/cpu: Add Xeon Icelake-D to list of CPUs that support PPIN Missed adding the Icelake-D CPU to the list. It uses the same MSRs to control and read the inventory number as all the other models. Fixes: dc6b025de95b ("x86/mce: Add Xeon Icelake to list of CPUs that support PPIN") Reported-by: Ailin Xu Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220121174743.1875294-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index bb9a46a804bf..baafbb37be67 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: From 90b8aa9f5b09edae6928c0561f933fec9f7a9987 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 21 Jan 2022 17:55:19 -0800 Subject: [PATCH 0571/1295] usb: typec: tcpm: Do not disconnect while receiving VBUS off With some chargers, vbus might momentarily raise above VSAFE5V and fall back to 0V before tcpm gets to read port->tcpc->get_vbus. This will will report a VBUS off event causing TCPM to transition to SNK_UNATTACHED where it should be waiting in either SNK_ATTACH_WAIT or SNK_DEBOUNCED state. This patch makes TCPM avoid vbus off events while in SNK_ATTACH_WAIT or SNK_DEBOUNCED state. Stub from the spec: "4.5.2.2.4.2 Exiting from AttachWait.SNK State A Sink shall transition to Unattached.SNK when the state of both the CC1 and CC2 pins is SNK.Open for at least tPDDebounce. A DRP shall transition to Unattached.SRC when the state of both the CC1 and CC2 pins is SNK.Open for at least tPDDebounce." [23.194131] CC1: 0 -> 0, CC2: 0 -> 5 [state SNK_UNATTACHED, polarity 0, connected] [23.201777] state change SNK_UNATTACHED -> SNK_ATTACH_WAIT [rev3 NONE_AMS] [23.209949] pending state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED @ 170 ms [rev3 NONE_AMS] [23.300579] VBUS off [23.300668] state change SNK_ATTACH_WAIT -> SNK_UNATTACHED [rev3 NONE_AMS] [23.301014] VBUS VSAFE0V [23.301111] Start toggling Fixes: f0690a25a140b8 ("staging: typec: USB Type-C Port Manager (tcpm)") Cc: stable@vger.kernel.org Acked-by: Heikki Krogerus Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20220122015520.332507-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 59d4fa2443f2..b8afe3d8c882 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5156,7 +5156,8 @@ static void _tcpm_pd_vbus_off(struct tcpm_port *port) case SNK_TRYWAIT_DEBOUNCE: break; case SNK_ATTACH_WAIT: - tcpm_set_state(port, SNK_UNATTACHED, 0); + case SNK_DEBOUNCED: + /* Do nothing, as TCPM is still waiting for vbus to reaach VSAFE5V to connect */ break; case SNK_NEGOTIATE_CAPABILITIES: From 746f96e7d6f7a276726860f696671766bfb24cf0 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 21 Jan 2022 17:55:20 -0800 Subject: [PATCH 0572/1295] usb: typec: tcpm: Do not disconnect when receiving VSAFE0V With some chargers, vbus might momentarily raise above VSAFE5V and fall back to 0V causing VSAFE0V to be triggered. This will will report a VBUS off event causing TCPM to transition to SNK_UNATTACHED state where it should be waiting in either SNK_ATTACH_WAIT or SNK_DEBOUNCED state. This patch makes TCPM avoid VSAFE0V events while in SNK_ATTACH_WAIT or SNK_DEBOUNCED state. Stub from the spec: "4.5.2.2.4.2 Exiting from AttachWait.SNK State A Sink shall transition to Unattached.SNK when the state of both the CC1 and CC2 pins is SNK.Open for at least tPDDebounce. A DRP shall transition to Unattached.SRC when the state of both the CC1 and CC2 pins is SNK.Open for at least tPDDebounce." [23.194131] CC1: 0 -> 0, CC2: 0 -> 5 [state SNK_UNATTACHED, polarity 0, connected] [23.201777] state change SNK_UNATTACHED -> SNK_ATTACH_WAIT [rev3 NONE_AMS] [23.209949] pending state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED @ 170 ms [rev3 NONE_AMS] [23.300579] VBUS off [23.300668] state change SNK_ATTACH_WAIT -> SNK_UNATTACHED [rev3 NONE_AMS] [23.301014] VBUS VSAFE0V [23.301111] Start toggling Fixes: 28b43d3d746b8 ("usb: typec: tcpm: Introduce vsafe0v for vbus") Cc: stable@vger.kernel.org Acked-by: Heikki Krogerus Signed-off-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20220122015520.332507-2-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b8afe3d8c882..5fce795b69c7 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5264,6 +5264,10 @@ static void _tcpm_pd_vbus_vsafe0v(struct tcpm_port *port) case PR_SWAP_SNK_SRC_SOURCE_ON: /* Do nothing, vsafe0v is expected during transition */ break; + case SNK_ATTACH_WAIT: + case SNK_DEBOUNCED: + /*Do nothing, still waiting for VSAFE5V for connect */ + break; default: if (port->pwr_role == TYPEC_SINK && port->auto_vbus_discharge_enabled) tcpm_set_state(port, SNK_UNATTACHED, 0); From 5b67b315037250a61861119683e7fcb509deea25 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 24 Jan 2022 15:14:40 -0500 Subject: [PATCH 0573/1295] usb-storage: Add unusual-devs entry for VL817 USB-SATA bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two people have reported (and mentioned numerous other reports on the web) that VIA's VL817 USB-SATA bridge does not work with the uas driver. Typical log messages are: [ 3606.232149] sd 14:0:0:0: [sdg] tag#2 uas_zap_pending 0 uas-tag 1 inflight: CMD [ 3606.232154] sd 14:0:0:0: [sdg] tag#2 CDB: Write(16) 8a 00 00 00 00 00 18 0c c9 80 00 00 00 80 00 00 [ 3606.306257] usb 4-4.4: reset SuperSpeed Plus Gen 2x1 USB device number 11 using xhci_hcd [ 3606.328584] scsi host14: uas_eh_device_reset_handler success Surprisingly, the devices do seem to work okay for some other people. The cause of the differing behaviors is not known. In the hope of getting the devices to work for the most users, even at the possible cost of degraded performance for some, this patch adds an unusual_devs entry for the VL817 to block it from binding to the uas driver by default. Users will be able to override this entry by means of a module parameter, if they want. CC: Reported-by: DocMAX Reported-and-tested-by: Thomas Weißschuh Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/Ye8IsK2sjlEv1rqU@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 29191d33c0e3..1a05e3dcfec8 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -2301,6 +2301,16 @@ UNUSUAL_DEV( 0x2027, 0xa001, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, usb_stor_euscsi_init, US_FL_SCM_MULT_TARG ), +/* + * Reported by DocMAX + * and Thomas Weißschuh + */ +UNUSUAL_DEV( 0x2109, 0x0715, 0x9999, 0x9999, + "VIA Labs, Inc.", + "VL817 SATA Bridge", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_IGNORE_UAS), + UNUSUAL_DEV( 0x2116, 0x0320, 0x0001, 0x0001, "ST", "2A", From 26fbe9772b8c459687930511444ce443011f86bf Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 24 Jan 2022 15:23:45 -0500 Subject: [PATCH 0574/1295] USB: core: Fix hang in usb_kill_urb by adding memory barriers The syzbot fuzzer has identified a bug in which processes hang waiting for usb_kill_urb() to return. It turns out the issue is not unlinking the URB; that works just fine. Rather, the problem arises when the wakeup notification that the URB has completed is not received. The reason is memory-access ordering on SMP systems. In outline form, usb_kill_urb() and __usb_hcd_giveback_urb() operating concurrently on different CPUs perform the following actions: CPU 0 CPU 1 ---------------------------- --------------------------------- usb_kill_urb(): __usb_hcd_giveback_urb(): ... ... atomic_inc(&urb->reject); atomic_dec(&urb->use_count); ... ... wait_event(usb_kill_urb_queue, atomic_read(&urb->use_count) == 0); if (atomic_read(&urb->reject)) wake_up(&usb_kill_urb_queue); Confining your attention to urb->reject and urb->use_count, you can see that the overall pattern of accesses on CPU 0 is: write urb->reject, then read urb->use_count; whereas the overall pattern of accesses on CPU 1 is: write urb->use_count, then read urb->reject. This pattern is referred to in memory-model circles as SB (for "Store Buffering"), and it is well known that without suitable enforcement of the desired order of accesses -- in the form of memory barriers -- it is entirely possible for one or both CPUs to execute their reads ahead of their writes. The end result will be that sometimes CPU 0 sees the old un-decremented value of urb->use_count while CPU 1 sees the old un-incremented value of urb->reject. Consequently CPU 0 ends up on the wait queue and never gets woken up, leading to the observed hang in usb_kill_urb(). The same pattern of accesses occurs in usb_poison_urb() and the failure pathway of usb_hcd_submit_urb(). The problem is fixed by adding suitable memory barriers. To provide proper memory-access ordering in the SB pattern, a full barrier is required on both CPUs. The atomic_inc() and atomic_dec() accesses themselves don't provide any memory ordering, but since they are present, we can use the optimized smp_mb__after_atomic() memory barrier in the various routines to obtain the desired effect. This patch adds the necessary memory barriers. CC: Reported-and-tested-by: syzbot+76629376e06e2c2ad626@syzkaller.appspotmail.com Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/Ye8K0QYee0Q0Nna2@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 14 ++++++++++++++ drivers/usb/core/urb.c | 12 ++++++++++++ 2 files changed, 26 insertions(+) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 3e01dd6e509b..d9712c2602af 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1563,6 +1563,13 @@ int usb_hcd_submit_urb (struct urb *urb, gfp_t mem_flags) urb->hcpriv = NULL; INIT_LIST_HEAD(&urb->urb_list); atomic_dec(&urb->use_count); + /* + * Order the write of urb->use_count above before the read + * of urb->reject below. Pairs with the memory barriers in + * usb_kill_urb() and usb_poison_urb(). + */ + smp_mb__after_atomic(); + atomic_dec(&urb->dev->urbnum); if (atomic_read(&urb->reject)) wake_up(&usb_kill_urb_queue); @@ -1665,6 +1672,13 @@ static void __usb_hcd_giveback_urb(struct urb *urb) usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); + /* + * Order the write of urb->use_count above before the read + * of urb->reject below. Pairs with the memory barriers in + * usb_kill_urb() and usb_poison_urb(). + */ + smp_mb__after_atomic(); + if (unlikely(atomic_read(&urb->reject))) wake_up(&usb_kill_urb_queue); usb_put_urb(urb); diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 30727729a44c..33d62d7e3929 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -715,6 +715,12 @@ void usb_kill_urb(struct urb *urb) if (!(urb && urb->dev && urb->ep)) return; atomic_inc(&urb->reject); + /* + * Order the write of urb->reject above before the read + * of urb->use_count below. Pairs with the barriers in + * __usb_hcd_giveback_urb() and usb_hcd_submit_urb(). + */ + smp_mb__after_atomic(); usb_hcd_unlink_urb(urb, -ENOENT); wait_event(usb_kill_urb_queue, atomic_read(&urb->use_count) == 0); @@ -756,6 +762,12 @@ void usb_poison_urb(struct urb *urb) if (!urb) return; atomic_inc(&urb->reject); + /* + * Order the write of urb->reject above before the read + * of urb->use_count below. Pairs with the barriers in + * __usb_hcd_giveback_urb() and usb_hcd_submit_urb(). + */ + smp_mb__after_atomic(); if (!urb->dev || !urb->ep) return; From e3d26528e083e612314d4dcd713f3d5a26143ddc Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 6 Jan 2022 19:10:21 +0100 Subject: [PATCH 0575/1295] drm/etnaviv: relax submit size limits While all userspace tried to limit commandstreams to 64K in size, a bug in the Mesa driver lead to command streams of up to 128K being submitted. Allow those to avoid breaking existing userspace. Fixes: 6dfa2fab8ddd ("drm/etnaviv: limit submit sizes") Cc: stable@vger.kernel.org Signed-off-by: Lucas Stach Reviewed-by: Christian Gmeiner --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index b03c20c14ca1..a17313282e8b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -469,8 +469,8 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, return -EINVAL; } - if (args->stream_size > SZ_64K || args->nr_relocs > SZ_64K || - args->nr_bos > SZ_64K || args->nr_pmrs > 128) { + if (args->stream_size > SZ_128K || args->nr_relocs > SZ_128K || + args->nr_bos > SZ_128K || args->nr_pmrs > 128) { DRM_ERROR("submit arguments out of size limits\n"); return -EINVAL; } From 7938d61591d33394a21bdd7797a245b65428f44c Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 19 Oct 2021 13:27:10 +0100 Subject: [PATCH 0576/1295] drm/i915: Flush TLBs before releasing backing store We need to flush TLBs before releasing backing store otherwise userspace is able to encounter stale entries if a) it is not declaring access to certain buffers and b) it races with the backing store release from a such undeclared execution already executing on the GPU in parallel. The approach taken is to mark any buffer objects which were ever bound to the GPU and to trigger a serialized TLB flush when their backing store is released. Alternatively the flushing could be done on VMA unbind, at which point we would be able to ascertain whether there is potential a parallel GPU execution (which could race), but essentially it boils down to paying the cost of TLB flushes potentially needlessly at VMA unbind time (when the backing store is not known to be going away so not needed for safety), versus potentially needlessly at backing store relase time (since we at that point cannot tell whether there is anything executing on the GPU which uses that object). Thereforce simplicity of implementation has been chosen for now with scope to benchmark and refine later as required. Signed-off-by: Tvrtko Ursulin Reported-by: Sushma Venkatesh Reddy Reviewed-by: Daniel Vetter Acked-by: Dave Airlie Cc: Daniel Vetter Cc: Jon Bloomfield Cc: Joonas Lahtinen Cc: Jani Nikula Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- .../gpu/drm/i915/gem/i915_gem_object_types.h | 1 + drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++ drivers/gpu/drm/i915/gt/intel_gt.c | 108 ++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_gt.h | 2 + drivers/gpu/drm/i915/gt/intel_gt_types.h | 2 + drivers/gpu/drm/i915/i915_reg.h | 11 ++ drivers/gpu/drm/i915/i915_vma.c | 3 + drivers/gpu/drm/i915/intel_uncore.c | 26 ++++- drivers/gpu/drm/i915/intel_uncore.h | 2 + 9 files changed, 161 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 4b4829eb16c2..0dd107dcecc2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -311,6 +311,7 @@ struct drm_i915_gem_object { #define I915_BO_READONLY BIT(6) #define I915_TILING_QUIRK_BIT 7 /* unknown swizzling; do not release! */ #define I915_BO_PROTECTED BIT(8) +#define I915_BO_WAS_BOUND_BIT 9 /** * @mem_flags - Mutable placement-related flags * diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 9f429ed6e78a..a50f884973bc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -10,6 +10,8 @@ #include "i915_gem_lmem.h" #include "i915_gem_mman.h" +#include "gt/intel_gt.h" + void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, struct sg_table *pages, unsigned int sg_page_sizes) @@ -221,6 +223,14 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj) __i915_gem_object_reset_page_iter(obj); obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0; + if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); + intel_wakeref_t wakeref; + + with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref) + intel_gt_invalidate_tlbs(to_gt(i915)); + } + return pages; } diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index f98f0fb21efb..35d0fcd3a86c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -29,6 +29,8 @@ void __intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) { spin_lock_init(>->irq_lock); + mutex_init(>->tlb_invalidate_lock); + INIT_LIST_HEAD(>->closed_vma); spin_lock_init(>->closed_lock); @@ -912,3 +914,109 @@ void intel_gt_info_print(const struct intel_gt_info *info, intel_sseu_dump(&info->sseu, p); } + +struct reg_and_bit { + i915_reg_t reg; + u32 bit; +}; + +static struct reg_and_bit +get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, + const i915_reg_t *regs, const unsigned int num) +{ + const unsigned int class = engine->class; + struct reg_and_bit rb = { }; + + if (drm_WARN_ON_ONCE(&engine->i915->drm, + class >= num || !regs[class].reg)) + return rb; + + rb.reg = regs[class]; + if (gen8 && class == VIDEO_DECODE_CLASS) + rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ + else + rb.bit = engine->instance; + + rb.bit = BIT(rb.bit); + + return rb; +} + +void intel_gt_invalidate_tlbs(struct intel_gt *gt) +{ + static const i915_reg_t gen8_regs[] = { + [RENDER_CLASS] = GEN8_RTCR, + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, + [COPY_ENGINE_CLASS] = GEN8_BTCR, + }; + static const i915_reg_t gen12_regs[] = { + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, + }; + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + struct intel_engine_cs *engine; + enum intel_engine_id id; + const i915_reg_t *regs; + unsigned int num = 0; + + if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) + return; + + if (GRAPHICS_VER(i915) == 12) { + regs = gen12_regs; + num = ARRAY_SIZE(gen12_regs); + } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { + regs = gen8_regs; + num = ARRAY_SIZE(gen8_regs); + } else if (GRAPHICS_VER(i915) < 8) { + return; + } + + if (drm_WARN_ONCE(&i915->drm, !num, + "Platform does not implement TLB invalidation!")) + return; + + GEM_TRACE("\n"); + + assert_rpm_wakelock_held(&i915->runtime_pm); + + mutex_lock(>->tlb_invalidate_lock); + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + for_each_engine(engine, gt, id) { + /* + * HW architecture suggest typical invalidation time at 40us, + * with pessimistic cases up to 100us and a recommendation to + * cap at 1ms. We go a bit higher just in case. + */ + const unsigned int timeout_us = 100; + const unsigned int timeout_ms = 4; + struct reg_and_bit rb; + + rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); + if (!i915_mmio_reg_offset(rb.reg)) + continue; + + intel_uncore_write_fw(uncore, rb.reg, rb.bit); + if (__intel_wait_for_register_fw(uncore, + rb.reg, rb.bit, 0, + timeout_us, timeout_ms, + NULL)) + drm_err_ratelimited(>->i915->drm, + "%s TLB invalidation did not complete in %ums!\n", + engine->name, timeout_ms); + } + + /* + * Use delayed put since a) we mostly expect a flurry of TLB + * invalidations so it is good to avoid paying the forcewake cost and + * b) it works around a bug in Icelake which cannot cope with too rapid + * transitions. + */ + intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL); + mutex_unlock(>->tlb_invalidate_lock); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 3ace129eb2af..a913fb6ffec3 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -91,4 +91,6 @@ void intel_gt_info_print(const struct intel_gt_info *info, void intel_gt_watchdog_work(struct work_struct *work); +void intel_gt_invalidate_tlbs(struct intel_gt *gt); + #endif /* __INTEL_GT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index 14216cc471b1..f20687796490 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -73,6 +73,8 @@ struct intel_gt { struct intel_uc uc; + struct mutex tlb_invalidate_lock; + struct i915_wa_list wa_list; struct intel_gt_timelines { diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 971d601fe751..c32420cb8ed5 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2721,6 +2721,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING (1 << 28) #define GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT (1 << 24) +#define GEN8_RTCR _MMIO(0x4260) +#define GEN8_M1TCR _MMIO(0x4264) +#define GEN8_M2TCR _MMIO(0x4268) +#define GEN8_BTCR _MMIO(0x426c) +#define GEN8_VTCR _MMIO(0x4270) + #if 0 #define PRB0_TAIL _MMIO(0x2030) #define PRB0_HEAD _MMIO(0x2034) @@ -2819,6 +2825,11 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define FAULT_VA_HIGH_BITS (0xf << 0) #define FAULT_GTT_SEL (1 << 4) +#define GEN12_GFX_TLB_INV_CR _MMIO(0xced8) +#define GEN12_VD_TLB_INV_CR _MMIO(0xcedc) +#define GEN12_VE_TLB_INV_CR _MMIO(0xcee0) +#define GEN12_BLT_TLB_INV_CR _MMIO(0xcee4) + #define GEN12_AUX_ERR_DBG _MMIO(0x43f4) #define FPGA_DBG _MMIO(0x42300) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 29a858c53bdd..c0d6d5526abe 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -457,6 +457,9 @@ int i915_vma_bind(struct i915_vma *vma, vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags); } + if (vma->obj) + set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags); + atomic_or(bind_flags, &vma->flags); return 0; } diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index fc25ebf1a593..778da3179b3c 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -724,7 +724,8 @@ void intel_uncore_forcewake_get__locked(struct intel_uncore *uncore, } static void __intel_uncore_forcewake_put(struct intel_uncore *uncore, - enum forcewake_domains fw_domains) + enum forcewake_domains fw_domains, + bool delayed) { struct intel_uncore_forcewake_domain *domain; unsigned int tmp; @@ -739,7 +740,11 @@ static void __intel_uncore_forcewake_put(struct intel_uncore *uncore, continue; } - fw_domains_put(uncore, domain->mask); + if (delayed && + !(domain->uncore->fw_domains_timer & domain->mask)) + fw_domain_arm_timer(domain); + else + fw_domains_put(uncore, domain->mask); } } @@ -760,7 +765,20 @@ void intel_uncore_forcewake_put(struct intel_uncore *uncore, return; spin_lock_irqsave(&uncore->lock, irqflags); - __intel_uncore_forcewake_put(uncore, fw_domains); + __intel_uncore_forcewake_put(uncore, fw_domains, false); + spin_unlock_irqrestore(&uncore->lock, irqflags); +} + +void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore, + enum forcewake_domains fw_domains) +{ + unsigned long irqflags; + + if (!uncore->fw_get_funcs) + return; + + spin_lock_irqsave(&uncore->lock, irqflags); + __intel_uncore_forcewake_put(uncore, fw_domains, true); spin_unlock_irqrestore(&uncore->lock, irqflags); } @@ -802,7 +820,7 @@ void intel_uncore_forcewake_put__locked(struct intel_uncore *uncore, if (!uncore->fw_get_funcs) return; - __intel_uncore_forcewake_put(uncore, fw_domains); + __intel_uncore_forcewake_put(uncore, fw_domains, false); } void assert_forcewakes_inactive(struct intel_uncore *uncore) diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h index 210fe2a71612..2a15b2b2e2fc 100644 --- a/drivers/gpu/drm/i915/intel_uncore.h +++ b/drivers/gpu/drm/i915/intel_uncore.h @@ -246,6 +246,8 @@ void intel_uncore_forcewake_get(struct intel_uncore *uncore, enum forcewake_domains domains); void intel_uncore_forcewake_put(struct intel_uncore *uncore, enum forcewake_domains domains); +void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore, + enum forcewake_domains domains); void intel_uncore_forcewake_flush(struct intel_uncore *uncore, enum forcewake_domains fw_domains); From f26d04331360d42dbd6b58448bd98e4edbfbe1c5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 13 Jan 2022 18:54:38 -0500 Subject: [PATCH 0577/1295] audit: improve audit queue handling when "audit=1" on cmdline When an admin enables audit at early boot via the "audit=1" kernel command line the audit queue behavior is slightly different; the audit subsystem goes to greater lengths to avoid dropping records, which unfortunately can result in problems when the audit daemon is forcibly stopped for an extended period of time. This patch makes a number of changes designed to improve the audit queuing behavior so that leaving the audit daemon in a stopped state for an extended period does not cause a significant impact to the system. - kauditd_send_queue() is now limited to looping through the passed queue only once per call. This not only prevents the function from looping indefinitely when records are returned to the current queue, it also allows any recovery handling in kauditd_thread() to take place when kauditd_send_queue() returns. - Transient netlink send errors seen as -EAGAIN now cause the record to be returned to the retry queue instead of going to the hold queue. The intention of the hold queue is to store, perhaps for an extended period of time, the events which led up to the audit daemon going offline. The retry queue remains a temporary queue intended to protect against transient issues between the kernel and the audit daemon. - The retry queue is now limited by the audit_backlog_limit setting, the same as the other queues. This allows admins to bound the size of all of the audit queues on the system. - kauditd_rehold_skb() now returns records to the end of the hold queue to ensure ordering is preserved in the face of recent changes to kauditd_send_queue(). Cc: stable@vger.kernel.org Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling") Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index e4bbe2c70c26..7690c29d4ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -745,7 +769,7 @@ retry: rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ From 235528072f28b3b0a1446279b7eaddda36dbf743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Thu, 13 Jan 2022 00:36:57 +0100 Subject: [PATCH 0578/1295] kunit: tool: Import missing importlib.abc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 3.10.0 contains: 9e09849d20 ("bpo-41006: importlib.util no longer imports typing (GH-20938)") It causes importlib.util to no longer import importlib.abs, which leads to the following error when trying to use kunit with qemu: AttributeError: module 'importlib' has no attribute 'abc'. Did you mean: '_abc'? Add the missing import. Signed-off-by: Michał Winiarski Reviewed-by: Daniel Latypov Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 44bbe54f25f1..3c4196cef3ed 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -6,6 +6,7 @@ # Author: Felix Guo # Author: Brendan Higgins +import importlib.abc import importlib.util import logging import subprocess From f034cc1301e7d83d4ec428dd6b8ffb57ca446efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 12 Jan 2022 14:41:42 -0500 Subject: [PATCH 0579/1295] selftests: rtc: Increase test timeout so that all tests run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeout setting for the rtc kselftest is currently 90 seconds. This setting is used by the kselftest runner to stop running a test if it takes longer than the assigned value. However, two of the test cases inside rtc set alarms. These alarms are set to the next beginning of the minute, so each of these test cases may take up to, in the worst case, 60 seconds. In order to allow for all test cases in rtc to run, even in the worst case, when using the kselftest runner, the timeout value should be increased to at least 120. Set it to 180, so there's some additional slack. Correct operation can be tested by running the following command right after the start of a minute (low second count), and checking that all test cases run: ./run_kselftest.sh -c rtc Signed-off-by: Nícolas F. R. A. Prado Acked-by: Alexandre Belloni Signed-off-by: Shuah Khan --- tools/testing/selftests/rtc/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rtc/settings b/tools/testing/selftests/rtc/settings index ba4d85f74cd6..a953c96aa16e 100644 --- a/tools/testing/selftests/rtc/settings +++ b/tools/testing/selftests/rtc/settings @@ -1 +1 @@ -timeout=90 +timeout=180 From 40d70d4d60974c28054a60316f2aec8810833526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Fri, 14 Jan 2022 18:21:26 -0500 Subject: [PATCH 0580/1295] selftests: cpufreq: Write test output to stdout as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use 'tee' to send the test output to stdout in addition to the current output file. This makes the output easier to handle in automated test systems and is superior to only later dumping the output file contents to stdout, since this way the test output can be interleaved with other log messages, like from the kernel, so that chronology is preserved, making it easier to detect issues. Signed-off-by: Nícolas F. R. A. Prado Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index 31f8c9a76c5f..60ce18ed0666 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -194,5 +194,5 @@ prerequisite # Run requested functions clear_dumps $OUTFILE -do_test >> $OUTFILE.txt +do_test | tee -a $OUTFILE.txt dmesg_dumps $OUTFILE From 92d25637a3a45904292c93f1863c6bbda4e3e38f Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 17 Dec 2021 17:29:55 +0800 Subject: [PATCH 0581/1295] kselftest: signal all child processes We have some many cases that will create child process as well, such as pidfd_wait. Previously, we will signal/kill the parent process when it is time out, but this signal will not be sent to its child process. In such case, if child process doesn't terminate itself, ksefltest framework will hang forever. Here we group all its child processes so that kill() can signal all of them in timeout. Fixed change log: Shuah Khan Suggested-by: yang xu Signed-off-by: Li Zhijian Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 471eaa7b3a3f..11779405dc80 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -877,7 +877,8 @@ static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) } t->timed_out = true; - kill(t->pid, SIGKILL); + // signal process group + kill(-(t->pid), SIGKILL); } void __wait_for_test(struct __test_metadata *t) @@ -987,6 +988,7 @@ void __run_test(struct __fixture_metadata *f, ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { + setpgrp(); t->fn(t, variant); if (t->skip) _exit(255); From 809232619f5b15e31fb3563985e705454f32621f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 17 Jan 2022 15:30:10 -0500 Subject: [PATCH 0582/1295] sched/membarrier: Fix membarrier-rseq fence command missing from query bitmask The membarrier command MEMBARRIER_CMD_QUERY allows querying the available membarrier commands. When the membarrier-rseq fence commands were added, a new MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK was introduced with the intent to expose them with the MEMBARRIER_CMD_QUERY command, the but it was never added to MEMBARRIER_CMD_BITMASK. The membarrier-rseq fence commands are therefore not wired up with the query command. Rename MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK to MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK (the bitmask is not a command per-se), and change the erroneous MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK (which does not actually exist) to MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ. Wire up MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK in MEMBARRIER_CMD_BITMASK. Fixing this allows discovering availability of the membarrier-rseq fence feature. Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: # 5.10+ Link: https://lkml.kernel.org/r/20220117203010.30129-1-mathieu.desnoyers@efficios.com --- kernel/sched/membarrier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b5add64d9698..3d2825408e3a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -147,11 +147,11 @@ #endif #ifdef CONFIG_RSEQ -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) #else -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 #endif #define MEMBARRIER_CMD_BITMASK \ @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { From 5f0c749158158f89eba7647bdc4e8096979de981 Mon Sep 17 00:00:00 2001 From: Meenakshikumar Somasundaram Date: Mon, 15 Nov 2021 01:51:37 -0500 Subject: [PATCH 0583/1295] drm/amd/display: Fix for otg synchronization logic [Why] During otg sync trigger, plane states are used to decide whether the otg is already synchronized or not. There are scenarions when otgs are disabled without plane state getting disabled and in such case the otg is excluded from synchronization. [How] Introduced pipe_idx_syncd in pipe_ctx that tracks each otgs master pipe. When a otg is disabled/enabled, pipe_idx_syncd is reset to itself. On sync trigger, pipe_idx_syncd is checked to decide whether a otg is already synchronized and the otg is further included or excluded from synchronization. v2: Don't drop is_blanked logic Reviewed-by: Jun Lei Reviewed-by: Mustapha Ghaddar Acked-by: Bhawanpreet Lakha Signed-off-by: meenakshikumar somasundaram Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Harry Wentland Cc: torvalds@linux-foundation.org Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 40 +++++++++----- .../gpu/drm/amd/display/dc/core/dc_resource.c | 54 +++++++++++++++++++ drivers/gpu/drm/amd/display/dc/dc.h | 1 + .../display/dc/dce110/dce110_hw_sequencer.c | 8 +++ .../drm/amd/display/dc/dcn31/dcn31_resource.c | 3 ++ .../gpu/drm/amd/display/dc/inc/core_types.h | 1 + drivers/gpu/drm/amd/display/dc/inc/resource.h | 11 ++++ 7 files changed, 105 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 01c8849b9db2..6f5528d34093 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1404,20 +1404,34 @@ static void program_timing_sync( status->timing_sync_info.master = false; } - /* remove any other unblanked pipes as they have already been synced */ - for (j = j + 1; j < group_size; j++) { - bool is_blanked; - if (pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked) - is_blanked = - pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked(pipe_set[j]->stream_res.opp); - else - is_blanked = - pipe_set[j]->stream_res.tg->funcs->is_blanked(pipe_set[j]->stream_res.tg); - if (!is_blanked) { - group_size--; - pipe_set[j] = pipe_set[group_size]; - j--; + /* remove any other pipes that are already been synced */ + if (dc->config.use_pipe_ctx_sync_logic) { + /* check pipe's syncd to decide which pipe to be removed */ + for (j = 1; j < group_size; j++) { + if (pipe_set[j]->pipe_idx_syncd == pipe_set[0]->pipe_idx_syncd) { + group_size--; + pipe_set[j] = pipe_set[group_size]; + j--; + } else + /* link slave pipe's syncd with master pipe */ + pipe_set[j]->pipe_idx_syncd = pipe_set[0]->pipe_idx_syncd; + } + } else { + for (j = j + 1; j < group_size; j++) { + bool is_blanked; + + if (pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked) + is_blanked = + pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked(pipe_set[j]->stream_res.opp); + else + is_blanked = + pipe_set[j]->stream_res.tg->funcs->is_blanked(pipe_set[j]->stream_res.tg); + if (!is_blanked) { + group_size--; + pipe_set[j] = pipe_set[group_size]; + j--; + } } } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index d4ff6cc6b8d9..b3912ff9dc91 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -3217,6 +3217,60 @@ struct hpo_dp_link_encoder *resource_get_hpo_dp_link_enc_for_det_lt( } #endif +void reset_syncd_pipes_from_disabled_pipes(struct dc *dc, + struct dc_state *context) +{ + int i, j; + struct pipe_ctx *pipe_ctx_old, *pipe_ctx, *pipe_ctx_syncd; + + /* If pipe backend is reset, need to reset pipe syncd status */ + for (i = 0; i < dc->res_pool->pipe_count; i++) { + pipe_ctx_old = &dc->current_state->res_ctx.pipe_ctx[i]; + pipe_ctx = &context->res_ctx.pipe_ctx[i]; + + if (!pipe_ctx_old->stream) + continue; + + if (pipe_ctx_old->top_pipe || pipe_ctx_old->prev_odm_pipe) + continue; + + if (!pipe_ctx->stream || + pipe_need_reprogram(pipe_ctx_old, pipe_ctx)) { + + /* Reset all the syncd pipes from the disabled pipe */ + for (j = 0; j < dc->res_pool->pipe_count; j++) { + pipe_ctx_syncd = &context->res_ctx.pipe_ctx[j]; + if ((GET_PIPE_SYNCD_FROM_PIPE(pipe_ctx_syncd) == pipe_ctx_old->pipe_idx) || + !IS_PIPE_SYNCD_VALID(pipe_ctx_syncd)) + SET_PIPE_SYNCD_TO_PIPE(pipe_ctx_syncd, j); + } + } + } +} + +void check_syncd_pipes_for_disabled_master_pipe(struct dc *dc, + struct dc_state *context, + uint8_t disabled_master_pipe_idx) +{ + int i; + struct pipe_ctx *pipe_ctx, *pipe_ctx_check; + + pipe_ctx = &context->res_ctx.pipe_ctx[disabled_master_pipe_idx]; + if ((GET_PIPE_SYNCD_FROM_PIPE(pipe_ctx) != disabled_master_pipe_idx) || + !IS_PIPE_SYNCD_VALID(pipe_ctx)) + SET_PIPE_SYNCD_TO_PIPE(pipe_ctx, disabled_master_pipe_idx); + + /* for the pipe disabled, check if any slave pipe exists and assert */ + for (i = 0; i < dc->res_pool->pipe_count; i++) { + pipe_ctx_check = &context->res_ctx.pipe_ctx[i]; + + if ((GET_PIPE_SYNCD_FROM_PIPE(pipe_ctx_check) == disabled_master_pipe_idx) && + IS_PIPE_SYNCD_VALID(pipe_ctx_check) && (i != disabled_master_pipe_idx)) + DC_ERR("DC: Failure: pipe_idx[%d] syncd with disabled master pipe_idx[%d]\n", + i, disabled_master_pipe_idx); + } +} + uint8_t resource_transmitter_to_phy_idx(const struct dc *dc, enum transmitter transmitter) { /* TODO - get transmitter to phy idx mapping from DMUB */ diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index da2c78ce14d6..288e7b01f561 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -344,6 +344,7 @@ struct dc_config { uint8_t vblank_alignment_max_frame_time_diff; bool is_asymmetric_memory; bool is_single_rank_dimm; + bool use_pipe_ctx_sync_logic; }; enum visual_confirm { diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index 78192ecba102..f1593186e964 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -1566,6 +1566,10 @@ static enum dc_status apply_single_controller_ctx_to_hw( &pipe_ctx->stream->audio_info); } + /* make sure no pipes syncd to the pipe being enabled */ + if (!pipe_ctx->stream->apply_seamless_boot_optimization && dc->config.use_pipe_ctx_sync_logic) + check_syncd_pipes_for_disabled_master_pipe(dc, context, pipe_ctx->pipe_idx); + #if defined(CONFIG_DRM_AMD_DC_DCN) /* DCN3.1 FPGA Workaround * Need to enable HPO DP Stream Encoder before setting OTG master enable. @@ -2297,6 +2301,10 @@ enum dc_status dce110_apply_ctx_to_hw( enum dc_status status; int i; + /* reset syncd pipes from disabled pipes */ + if (dc->config.use_pipe_ctx_sync_logic) + reset_syncd_pipes_from_disabled_pipes(dc, context); + /* Reset old context */ /* look up the targets that have been removed since last commit */ hws->funcs.reset_hw_ctx_wrap(dc, context); diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c index 42ed47e8133d..8d64187478e4 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c @@ -2260,6 +2260,9 @@ static bool dcn31_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + /* Use pipe context based otg sync logic */ + dc->config.use_pipe_ctx_sync_logic = true; + /* read VBIOS LTTPR caps */ { if (ctx->dc_bios->funcs->get_lttpr_caps) { diff --git a/drivers/gpu/drm/amd/display/dc/inc/core_types.h b/drivers/gpu/drm/amd/display/dc/inc/core_types.h index 890280026e69..943240e2809e 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/core_types.h +++ b/drivers/gpu/drm/amd/display/dc/inc/core_types.h @@ -382,6 +382,7 @@ struct pipe_ctx { struct pll_settings pll_settings; uint8_t pipe_idx; + uint8_t pipe_idx_syncd; struct pipe_ctx *top_pipe; struct pipe_ctx *bottom_pipe; diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h index 4249bf306e09..dbfe6690ded8 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/resource.h +++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h @@ -34,6 +34,10 @@ #define MEMORY_TYPE_HBM 2 +#define IS_PIPE_SYNCD_VALID(pipe) ((((pipe)->pipe_idx_syncd) & 0x80)?1:0) +#define GET_PIPE_SYNCD_FROM_PIPE(pipe) ((pipe)->pipe_idx_syncd & 0x7F) +#define SET_PIPE_SYNCD_TO_PIPE(pipe, pipe_syncd) ((pipe)->pipe_idx_syncd = (0x80 | pipe_syncd)) + enum dce_version resource_parse_asic_id( struct hw_asic_id asic_id); @@ -208,6 +212,13 @@ struct hpo_dp_link_encoder *resource_get_hpo_dp_link_enc_for_det_lt( const struct dc_link *link); #endif +void reset_syncd_pipes_from_disabled_pipes(struct dc *dc, + struct dc_state *context); + +void check_syncd_pipes_for_disabled_master_pipe(struct dc *dc, + struct dc_state *context, + uint8_t disabled_master_pipe_idx); + uint8_t resource_transmitter_to_phy_idx(const struct dc *dc, enum transmitter transmitter); #endif /* DRIVERS_GPU_DRM_AMD_DC_DEV_DC_INC_RESOURCE_H_ */ From ac46d93235074a6c5d280d35771c23fd8620e7d9 Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Wed, 19 Jan 2022 16:55:16 -0500 Subject: [PATCH 0584/1295] drm/amd/display: Correct MPC split policy for DCN301 [Why] DCN301 has seamless boot enabled. With MPC split enabled at the same time, system will hang. [How] Revert MPC split policy back to "MPC_SPLIT_AVOID". Since we have ODM combine enabled on DCN301, pipe split is not necessary here. Signed-off-by: Zhan Liu Reviewed-by: Charlene Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c index c1c6e602b06c..b4001233867c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c @@ -686,7 +686,7 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_clock_gate = true, .disable_pplib_clock_request = true, .disable_pplib_wm_range = true, - .pipe_split_policy = MPC_SPLIT_DYNAMIC, + .pipe_split_policy = MPC_SPLIT_AVOID, .force_single_disp_pipe_split = false, .disable_dcc = DCC_ENABLE, .vsr_support = true, From 7e38ac562b820915faa33a5077ca9bccf42d39d2 Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Wed, 19 Jan 2022 17:07:53 -0500 Subject: [PATCH 0585/1295] drm/amd/display: change FIFO reset condition to embedded display only [Why] FIFO reset is only necessary for fast boot sequence, where otg is disabled and dig fe is enabled when changing dispclk. Fast boot is only enabled on embedded displays. [How] Change FIFO reset condition to "embedded display only". Signed-off-by: Zhan Liu Reviewed-by: Charlene Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index f1593186e964..f3ff141b706a 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -1608,7 +1608,7 @@ static enum dc_status apply_single_controller_ctx_to_hw( pipe_ctx->stream_res.stream_enc, pipe_ctx->stream_res.tg->inst); - if (dc_is_dp_signal(pipe_ctx->stream->signal) && + if (dc_is_embedded_signal(pipe_ctx->stream->signal) && pipe_ctx->stream_res.stream_enc->funcs->reset_fifo) pipe_ctx->stream_res.stream_enc->funcs->reset_fifo( pipe_ctx->stream_res.stream_enc); From 9e5a14bce2402e84251a10269df0235cd7ce9234 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 20 Jan 2022 12:17:07 -0500 Subject: [PATCH 0586/1295] drm/amdgpu: filter out radeon secondary ids as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Older radeon boards (r2xx-r5xx) had secondary PCI functions which we solely there for supporting multi-head on OSs with special requirements. Add them to the unsupported list as well so we don't attempt to bind to them. The driver would fail to bind to them anyway, but this does so in a cleaner way that should not confuse the user. Cc: stable@vger.kernel.org Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b21bcdc97460..4c83f1db8a24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1525,6 +1525,87 @@ static const u16 amdgpu_unsupported_pciidlist[] = { 0x99A0, 0x99A2, 0x99A4, + /* radeon secondary ids */ + 0x3171, + 0x3e70, + 0x4164, + 0x4165, + 0x4166, + 0x4168, + 0x4170, + 0x4171, + 0x4172, + 0x4173, + 0x496e, + 0x4a69, + 0x4a6a, + 0x4a6b, + 0x4a70, + 0x4a74, + 0x4b69, + 0x4b6b, + 0x4b6c, + 0x4c6e, + 0x4e64, + 0x4e65, + 0x4e66, + 0x4e67, + 0x4e68, + 0x4e69, + 0x4e6a, + 0x4e71, + 0x4f73, + 0x5569, + 0x556b, + 0x556d, + 0x556f, + 0x5571, + 0x5854, + 0x5874, + 0x5940, + 0x5941, + 0x5b72, + 0x5b73, + 0x5b74, + 0x5b75, + 0x5d44, + 0x5d45, + 0x5d6d, + 0x5d6f, + 0x5d72, + 0x5d77, + 0x5e6b, + 0x5e6d, + 0x7120, + 0x7124, + 0x7129, + 0x712e, + 0x712f, + 0x7162, + 0x7163, + 0x7166, + 0x7167, + 0x7172, + 0x7173, + 0x71a0, + 0x71a1, + 0x71a3, + 0x71a7, + 0x71bb, + 0x71e0, + 0x71e1, + 0x71e2, + 0x71e6, + 0x71e7, + 0x71f2, + 0x7269, + 0x726b, + 0x726e, + 0x72a0, + 0x72a8, + 0x72b1, + 0x72b3, + 0x793f, }; static const struct pci_device_id pciidlist[] = { From dc919d670c6fd1ac81ebf31625cd19579f7b3d4c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 20 Jan 2022 12:52:13 -0500 Subject: [PATCH 0587/1295] drm/amdgpu/display: adjust msleep limit in dp_wait_for_training_aux_rd_interval Some architectures (e.g., ARM) have relatively low udelay limits. On most architectures, anything longer than 2000us is not recommended. Change the check to align with other similar checks in DC. Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 05e216524370..a012899e9dd4 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -202,7 +202,7 @@ void dp_wait_for_training_aux_rd_interval( uint32_t wait_in_micro_secs) { #if defined(CONFIG_DRM_AMD_DC_DCN) - if (wait_in_micro_secs > 16000) + if (wait_in_micro_secs > 1000) msleep(wait_in_micro_secs/1000); else udelay(wait_in_micro_secs); From 98fdcacb45f7cd2092151d6af2e60152811eb79c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 20 Jan 2022 12:57:33 -0500 Subject: [PATCH 0588/1295] drm/amdgpu/display: use msleep rather than udelay for long delays Some architectures (e.g., ARM) throw an compilation error if the udelay is too long. In general udelays of longer than 2000us are not recommended on any architecture. Switch to msleep in these cases. Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index a012899e9dd4..4c3ab2575e4b 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -6935,7 +6935,7 @@ bool dpcd_write_128b_132b_sst_payload_allocation_table( } } retries++; - udelay(5000); + msleep(5); } if (!result && retries == max_retries) { @@ -6987,7 +6987,7 @@ bool dpcd_poll_for_allocation_change_trigger(struct dc_link *link) break; } - udelay(5000); + msleep(5); } if (result == ACT_FAILED) { From ebc77bcc6e1660a011483c035d53c461c8dcc4f5 Mon Sep 17 00:00:00 2001 From: Zhou Qingyang Date: Tue, 25 Jan 2022 00:55:51 +0800 Subject: [PATCH 0589/1295] drm/amd/display/dc/calcs/dce_calcs: Fix a memleak in calculate_bandwidth() In calculate_bandwidth(), the tag free_sclk and free_yclk are reversed, which could lead to a memory leak of yclk. Fix this bug by changing the location of free_sclk and free_yclk. This bug was found by a static analyzer. Builds with 'make allyesconfig' show no new warnings, and our static analyzer no longer warns about this code. Fixes: 2be8989d0fc2 ("drm/amd/display/dc/calcs/dce_calcs: Move some large variables from the stack to the heap") Signed-off-by: Zhou Qingyang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/calcs/dce_calcs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dce_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dce_calcs.c index ff5bb152ef49..e6ef36de0825 100644 --- a/drivers/gpu/drm/amd/display/dc/calcs/dce_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/calcs/dce_calcs.c @@ -2033,10 +2033,10 @@ static void calculate_bandwidth( kfree(surface_type); free_tiling_mode: kfree(tiling_mode); -free_yclk: - kfree(yclk); free_sclk: kfree(sclk); +free_yclk: + kfree(yclk); } /******************************************************************************* From 72a8d87b87270bff0c0b2fed4d59c48d0dd840d7 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Mon, 24 Jan 2022 01:23:35 +0100 Subject: [PATCH 0590/1295] drm/amd/display: Fix FP start/end for dcn30_internal_validate_bw. It calls populate_dml_pipes which uses doubles to initialize the scale_ratio_depth params. Mirrors the dcn20 logic. Cc: stable@vger.kernel.org Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c index 602ec9a08549..8ca26383b568 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.c @@ -1878,7 +1878,6 @@ noinline bool dcn30_internal_validate_bw( dc->res_pool->funcs->update_soc_for_wm_a(dc, context); pipe_cnt = dc->res_pool->funcs->populate_dml_pipes(dc, context, pipes, fast_validate); - DC_FP_START(); if (!pipe_cnt) { out = true; goto validate_out; @@ -2104,7 +2103,6 @@ validate_fail: out = false; validate_out: - DC_FP_END(); return out; } @@ -2306,7 +2304,9 @@ bool dcn30_validate_bandwidth(struct dc *dc, BW_VAL_TRACE_COUNT(); + DC_FP_START(); out = dcn30_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, fast_validate); + DC_FP_END(); if (pipe_cnt == 0) goto validate_out; From 25f1488bdbba63415239ff301fe61a8546140d9f Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Mon, 24 Jan 2022 01:23:36 +0100 Subject: [PATCH 0591/1295] drm/amd/display: Wrap dcn301_calculate_wm_and_dlg for FPU. Mirrors the logic for dcn30. Cue lots of WARNs and some kernel panics without this fix. Cc: stable@vger.kernel.org Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/dcn301/dcn301_resource.c | 11 +++++++++++ .../gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.c | 2 +- .../gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.h | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c index b4001233867c..5d9637b07429 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c @@ -1380,6 +1380,17 @@ static void set_wm_ranges( pp_smu->nv_funcs.set_wm_ranges(&pp_smu->nv_funcs.pp_smu, &ranges); } +static void dcn301_calculate_wm_and_dlg( + struct dc *dc, struct dc_state *context, + display_e2e_pipe_params_st *pipes, + int pipe_cnt, + int vlevel) +{ + DC_FP_START(); + dcn301_calculate_wm_and_dlg_fp(dc, context, pipes, pipe_cnt, vlevel); + DC_FP_END(); +} + static struct resource_funcs dcn301_res_pool_funcs = { .destroy = dcn301_destroy_resource_pool, .link_enc_create = dcn301_link_encoder_create, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.c index 94c32832a0e7..0a7a33864973 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.c @@ -327,7 +327,7 @@ void dcn301_fpu_init_soc_bounding_box(struct bp_soc_bb_info bb_info) dcn3_01_soc.sr_exit_time_us = bb_info.dram_sr_exit_latency_100ns * 10; } -void dcn301_calculate_wm_and_dlg(struct dc *dc, +void dcn301_calculate_wm_and_dlg_fp(struct dc *dc, struct dc_state *context, display_e2e_pipe_params_st *pipes, int pipe_cnt, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.h b/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.h index fc7065d17842..774b0fdfc80b 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.h +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn301/dcn301_fpu.h @@ -34,7 +34,7 @@ void dcn301_fpu_set_wm_ranges(int i, void dcn301_fpu_init_soc_bounding_box(struct bp_soc_bb_info bb_info); -void dcn301_calculate_wm_and_dlg(struct dc *dc, +void dcn301_calculate_wm_and_dlg_fp(struct dc *dc, struct dc_state *context, display_e2e_pipe_params_st *pipes, int pipe_cnt, From 2a807341ed1074ab83638f2fab08dffaa373f6b8 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Sun, 23 Jan 2022 03:38:28 +0100 Subject: [PATCH 0592/1295] drm/amdgpu/display: Remove t_srx_delay_us. Unused. Convert the divisions into asserts on the divisor, to debug why it is zero. The divide by zero is suspected of causing kernel panics. While I have no idea where the zero is coming from I think this patch is a positive either way. Cc: stable@vger.kernel.org Reviewed-by: Harry Wentland Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c | 1 - .../gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c | 2 -- .../drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c | 2 -- .../gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c | 2 -- .../gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c | 2 -- drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h | 1 - drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.c | 3 --- drivers/gpu/drm/amd/display/dc/dml/dml1_display_rq_dlg_calc.c | 4 ---- 8 files changed, 17 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c index ec19678a0702..e447c74be713 100644 --- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c @@ -503,7 +503,6 @@ static void dcn_bw_calc_rq_dlg_ttu( //input[in_idx].dout.output_standard; /*todo: soc->sr_enter_plus_exit_time??*/ - dlg_sys_param->t_srx_delay_us = dc->dcn_ip->dcfclk_cstate_latency / v->dcf_clk_deep_sleep; dml1_rq_dlg_get_rq_params(dml, rq_param, &input->pipe.src); dml1_extract_rq_regs(dml, rq_regs, rq_param); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c index 246071c72f6b..548cdef8a8ad 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c @@ -1576,8 +1576,6 @@ void dml20_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib, dlg_sys_param.total_flip_bytes = get_total_immediate_flip_bytes(mode_lib, e2e_pipe_param, num_pipes); - dlg_sys_param.t_srx_delay_us = mode_lib->ip.dcfclk_cstate_latency - / dlg_sys_param.deepsleep_dcfclk_mhz; // TODO: Deprecated print__dlg_sys_params_st(mode_lib, &dlg_sys_param); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c index 015e7f2c0b16..0fc9f3e3ffae 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c @@ -1577,8 +1577,6 @@ void dml20v2_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib, dlg_sys_param.total_flip_bytes = get_total_immediate_flip_bytes(mode_lib, e2e_pipe_param, num_pipes); - dlg_sys_param.t_srx_delay_us = mode_lib->ip.dcfclk_cstate_latency - / dlg_sys_param.deepsleep_dcfclk_mhz; // TODO: Deprecated print__dlg_sys_params_st(mode_lib, &dlg_sys_param); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c index 8bc27de4c104..618f4b682ab1 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c @@ -1688,8 +1688,6 @@ void dml21_rq_dlg_get_dlg_reg( mode_lib, e2e_pipe_param, num_pipes); - dlg_sys_param.t_srx_delay_us = mode_lib->ip.dcfclk_cstate_latency - / dlg_sys_param.deepsleep_dcfclk_mhz; // TODO: Deprecated print__dlg_sys_params_st(mode_lib, &dlg_sys_param); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c index aef854270054..747167083dea 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c @@ -1858,8 +1858,6 @@ void dml30_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib, dlg_sys_param.total_flip_bytes = get_total_immediate_flip_bytes(mode_lib, e2e_pipe_param, num_pipes); - dlg_sys_param.t_srx_delay_us = mode_lib->ip.dcfclk_cstate_latency - / dlg_sys_param.deepsleep_dcfclk_mhz; // TODO: Deprecated print__dlg_sys_params_st(mode_lib, &dlg_sys_param); diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h index d46a2733024c..8f9f1d607f7c 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h +++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h @@ -546,7 +546,6 @@ struct _vcs_dpi_display_dlg_sys_params_st { double t_sr_wm_us; double t_extra_us; double mem_trip_us; - double t_srx_delay_us; double deepsleep_dcfclk_mhz; double total_flip_bw; unsigned int total_flip_bytes; diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.c b/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.c index 71ea503cb32f..412e75eb4704 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.c +++ b/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.c @@ -141,9 +141,6 @@ void print__dlg_sys_params_st(struct display_mode_lib *mode_lib, const struct _v dml_print("DML_RQ_DLG_CALC: t_urg_wm_us = %3.2f\n", dlg_sys_param->t_urg_wm_us); dml_print("DML_RQ_DLG_CALC: t_sr_wm_us = %3.2f\n", dlg_sys_param->t_sr_wm_us); dml_print("DML_RQ_DLG_CALC: t_extra_us = %3.2f\n", dlg_sys_param->t_extra_us); - dml_print( - "DML_RQ_DLG_CALC: t_srx_delay_us = %3.2f\n", - dlg_sys_param->t_srx_delay_us); dml_print( "DML_RQ_DLG_CALC: deepsleep_dcfclk_mhz = %3.2f\n", dlg_sys_param->deepsleep_dcfclk_mhz); diff --git a/drivers/gpu/drm/amd/display/dc/dml/dml1_display_rq_dlg_calc.c b/drivers/gpu/drm/amd/display/dc/dml/dml1_display_rq_dlg_calc.c index 59dc2c5b58dd..3df559c591f8 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dml1_display_rq_dlg_calc.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dml1_display_rq_dlg_calc.c @@ -1331,10 +1331,6 @@ void dml1_rq_dlg_get_dlg_params( if (dual_plane) DTRACE("DLG: %s: swath_height_c = %d", __func__, swath_height_c); - DTRACE( - "DLG: %s: t_srx_delay_us = %3.2f", - __func__, - (double) dlg_sys_param->t_srx_delay_us); DTRACE("DLG: %s: line_time_in_us = %3.2f", __func__, (double) line_time_in_us); DTRACE("DLG: %s: vupdate_offset = %d", __func__, vupdate_offset); DTRACE("DLG: %s: vupdate_width = %d", __func__, vupdate_width); From 4b77e4abb32cddb9e666e2ac411b0b0d6b8331dd Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Mon, 24 Jan 2022 12:30:02 +0100 Subject: [PATCH 0593/1295] PCI: mt7621: Drop of_match_ptr() to avoid unused variable We have stubs for most OF interfaces even when CONFIG_OF is not set, so we allow building of pcie-mt7621.c in that case for compile testing. When CONFIG_OF is not set, "of_match_ptr(mt7621_pcie_ids)" compiles to NULL, which leaves mt7621_pcie_ids unused: $ make W=1 drivers/pci/controller/pcie-mt7621.c:549:34: warning: unused variable 'mt7621_pcie_ids' [-Wunused-const-variable] Drop of_match_ptr() to avoid the unused variable warning. [bhelgaas: commit log] Fixes: 2bdd5238e756 ("PCI: mt7621: Add MediaTek MT7621 PCIe host controller driver") Link: https://lore.kernel.org/r/20220124113003.406224-2-sergio.paracuellos@gmail.com Link: https://lore.kernel.org/r/202201241754.igtHzgHv-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Sergio Paracuellos Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-mt7621.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c index 3824862ea144..f2e567282d3e 100644 --- a/drivers/pci/controller/pcie-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -557,7 +557,7 @@ static struct platform_driver mt7621_pcie_driver = { .remove = mt7621_pcie_remove, .driver = { .name = "mt7621-pci", - .of_match_table = of_match_ptr(mt7621_pcie_ids), + .of_match_table = mt7621_pcie_ids, }, }; builtin_platform_driver(mt7621_pcie_driver); From c035366d9c9fe48d947ee6c43465ab43d42e20f2 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Mon, 24 Jan 2022 12:30:03 +0100 Subject: [PATCH 0594/1295] PCI: mt7621: Remove unused function pcie_rmw() Function pcie_rmw() is not being used at all and can be deleted. Hence get rid of it, which fixes this warning: drivers/pci/controller/pcie-mt7621.c:112:20: warning: unused function 'pcie_rmw' [-Wunused-function] Fixes: 2bdd5238e756 ("PCI: mt7621: Add MediaTek MT7621 PCIe host controller driver") Link: https://lore.kernel.org/r/20220124113003.406224-3-sergio.paracuellos@gmail.com Link: https://lore.kernel.org/all/202201241754.igtHzgHv-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Sergio Paracuellos Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-mt7621.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c index f2e567282d3e..33eb37a2225c 100644 --- a/drivers/pci/controller/pcie-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -109,15 +109,6 @@ static inline void pcie_write(struct mt7621_pcie *pcie, u32 val, u32 reg) writel_relaxed(val, pcie->base + reg); } -static inline void pcie_rmw(struct mt7621_pcie *pcie, u32 reg, u32 clr, u32 set) -{ - u32 val = readl_relaxed(pcie->base + reg); - - val &= ~clr; - val |= set; - writel_relaxed(val, pcie->base + reg); -} - static inline u32 pcie_port_read(struct mt7621_pcie_port *port, u32 reg) { return readl_relaxed(port->base + reg); From 0e3135d3bfa5dfb658145238d2bc723a8e30c3a3 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Sat, 22 Jan 2022 10:29:36 +0000 Subject: [PATCH 0595/1295] bpf: Fix possible race in inc_misses_counter It seems inc_misses_counter() suffers from same issue fixed in the commit d979617aa84d ("bpf: Fixes possible race in update_prog_stats() for 32bit arches"): As it can run while interrupts are enabled, it could be re-entered and the u64_stats syncp could be mangled. Fixes: 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented") Signed-off-by: He Fengqing Acked-by: John Fastabend Link: https://lore.kernel.org/r/20220122102936.1219518-1-hefengqing@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..5e7edf913060 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -550,11 +550,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit From e2bcbd7769ee8f05e1b3d10848aace98973844e4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 24 Jan 2022 15:30:28 +0000 Subject: [PATCH 0596/1295] tools headers UAPI: remove stale lirc.h The lirc.h file is an old copy of lirc.h from the kernel sources. It is out of date, and the bpf lirc tests don't need a new copy anyway. As long as /usr/include/linux/lirc.h is from kernel v5.2 or newer, the tests will compile fine. Signed-off-by: Sean Young Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20220124153028.394409-1-sean@mess.org Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/lirc.h | 229 ------------------ .../selftests/bpf/test_lirc_mode2_user.c | 1 - 2 files changed, 230 deletions(-) delete mode 100644 tools/include/uapi/linux/lirc.h diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h deleted file mode 100644 index 45fcbf99d72e..000000000000 --- a/tools/include/uapi/linux/lirc.h +++ /dev/null @@ -1,229 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * lirc.h - linux infrared remote control header file - * last modified 2010/07/13 by Jarod Wilson - */ - -#ifndef _LINUX_LIRC_H -#define _LINUX_LIRC_H - -#include -#include - -#define PULSE_BIT 0x01000000 -#define PULSE_MASK 0x00FFFFFF - -#define LIRC_MODE2_SPACE 0x00000000 -#define LIRC_MODE2_PULSE 0x01000000 -#define LIRC_MODE2_FREQUENCY 0x02000000 -#define LIRC_MODE2_TIMEOUT 0x03000000 - -#define LIRC_VALUE_MASK 0x00FFFFFF -#define LIRC_MODE2_MASK 0xFF000000 - -#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) -#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) -#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) -#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) - -#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK) -#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK) - -#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE) -#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE) -#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY) -#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT) - -/* used heavily by lirc userspace */ -#define lirc_t int - -/*** lirc compatible hardware features ***/ - -#define LIRC_MODE2SEND(x) (x) -#define LIRC_SEND2MODE(x) (x) -#define LIRC_MODE2REC(x) ((x) << 16) -#define LIRC_REC2MODE(x) ((x) >> 16) - -#define LIRC_MODE_RAW 0x00000001 -#define LIRC_MODE_PULSE 0x00000002 -#define LIRC_MODE_MODE2 0x00000004 -#define LIRC_MODE_SCANCODE 0x00000008 -#define LIRC_MODE_LIRCCODE 0x00000010 - - -#define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) -#define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) -#define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) -#define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_SEND_MASK 0x0000003f - -#define LIRC_CAN_SET_SEND_CARRIER 0x00000100 -#define LIRC_CAN_SET_SEND_DUTY_CYCLE 0x00000200 -#define LIRC_CAN_SET_TRANSMITTER_MASK 0x00000400 - -#define LIRC_CAN_REC_RAW LIRC_MODE2REC(LIRC_MODE_RAW) -#define LIRC_CAN_REC_PULSE LIRC_MODE2REC(LIRC_MODE_PULSE) -#define LIRC_CAN_REC_MODE2 LIRC_MODE2REC(LIRC_MODE_MODE2) -#define LIRC_CAN_REC_SCANCODE LIRC_MODE2REC(LIRC_MODE_SCANCODE) -#define LIRC_CAN_REC_LIRCCODE LIRC_MODE2REC(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) - -#define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) -#define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) - -#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000 -#define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 -#define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 -#define LIRC_CAN_SET_REC_TIMEOUT 0x10000000 -#define LIRC_CAN_SET_REC_FILTER 0x08000000 - -#define LIRC_CAN_MEASURE_CARRIER 0x02000000 -#define LIRC_CAN_USE_WIDEBAND_RECEIVER 0x04000000 - -#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) -#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) - -#define LIRC_CAN_NOTIFY_DECODE 0x01000000 - -/*** IOCTL commands for lirc driver ***/ - -#define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) - -#define LIRC_GET_SEND_MODE _IOR('i', 0x00000001, __u32) -#define LIRC_GET_REC_MODE _IOR('i', 0x00000002, __u32) -#define LIRC_GET_REC_RESOLUTION _IOR('i', 0x00000007, __u32) - -#define LIRC_GET_MIN_TIMEOUT _IOR('i', 0x00000008, __u32) -#define LIRC_GET_MAX_TIMEOUT _IOR('i', 0x00000009, __u32) - -/* code length in bits, currently only for LIRC_MODE_LIRCCODE */ -#define LIRC_GET_LENGTH _IOR('i', 0x0000000f, __u32) - -#define LIRC_SET_SEND_MODE _IOW('i', 0x00000011, __u32) -#define LIRC_SET_REC_MODE _IOW('i', 0x00000012, __u32) -/* Note: these can reset the according pulse_width */ -#define LIRC_SET_SEND_CARRIER _IOW('i', 0x00000013, __u32) -#define LIRC_SET_REC_CARRIER _IOW('i', 0x00000014, __u32) -#define LIRC_SET_SEND_DUTY_CYCLE _IOW('i', 0x00000015, __u32) -#define LIRC_SET_TRANSMITTER_MASK _IOW('i', 0x00000017, __u32) - -/* - * when a timeout != 0 is set the driver will send a - * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is - * never sent, timeout is disabled by default - */ -#define LIRC_SET_REC_TIMEOUT _IOW('i', 0x00000018, __u32) - -/* 1 enables, 0 disables timeout reports in MODE2 */ -#define LIRC_SET_REC_TIMEOUT_REPORTS _IOW('i', 0x00000019, __u32) - -/* - * if enabled from the next key press on the driver will send - * LIRC_MODE2_FREQUENCY packets - */ -#define LIRC_SET_MEASURE_CARRIER_MODE _IOW('i', 0x0000001d, __u32) - -/* - * to set a range use LIRC_SET_REC_CARRIER_RANGE with the - * lower bound first and later LIRC_SET_REC_CARRIER with the upper bound - */ -#define LIRC_SET_REC_CARRIER_RANGE _IOW('i', 0x0000001f, __u32) - -#define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) - -/* - * Return the recording timeout, which is either set by - * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. - */ -#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) - -/* - * struct lirc_scancode - decoded scancode with protocol for use with - * LIRC_MODE_SCANCODE - * - * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR - * was decoded. - * @flags: should be 0 for transmit. When receiving scancodes, - * LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set - * depending on the protocol - * @rc_proto: see enum rc_proto - * @keycode: the translated keycode. Set to 0 for transmit. - * @scancode: the scancode received or to be sent - */ -struct lirc_scancode { - __u64 timestamp; - __u16 flags; - __u16 rc_proto; - __u32 keycode; - __u64 scancode; -}; - -/* Set if the toggle bit of rc-5 or rc-6 is enabled */ -#define LIRC_SCANCODE_FLAG_TOGGLE 1 -/* Set if this is a nec or sanyo repeat */ -#define LIRC_SCANCODE_FLAG_REPEAT 2 - -/** - * enum rc_proto - the Remote Controller protocol - * - * @RC_PROTO_UNKNOWN: Protocol not known - * @RC_PROTO_OTHER: Protocol known but proprietary - * @RC_PROTO_RC5: Philips RC5 protocol - * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol - * @RC_PROTO_RC5_SZ: StreamZap variant of RC5 - * @RC_PROTO_JVC: JVC protocol - * @RC_PROTO_SONY12: Sony 12 bit protocol - * @RC_PROTO_SONY15: Sony 15 bit protocol - * @RC_PROTO_SONY20: Sony 20 bit protocol - * @RC_PROTO_NEC: NEC protocol - * @RC_PROTO_NECX: Extended NEC protocol - * @RC_PROTO_NEC32: NEC 32 bit protocol - * @RC_PROTO_SANYO: Sanyo protocol - * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard - * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse - * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol - * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol - * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol - * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol - * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol - * @RC_PROTO_SHARP: Sharp protocol - * @RC_PROTO_XMP: XMP protocol - * @RC_PROTO_CEC: CEC protocol - * @RC_PROTO_IMON: iMon Pad protocol - * @RC_PROTO_RCMM12: RC-MM protocol 12 bits - * @RC_PROTO_RCMM24: RC-MM protocol 24 bits - * @RC_PROTO_RCMM32: RC-MM protocol 32 bits - */ -enum rc_proto { - RC_PROTO_UNKNOWN = 0, - RC_PROTO_OTHER = 1, - RC_PROTO_RC5 = 2, - RC_PROTO_RC5X_20 = 3, - RC_PROTO_RC5_SZ = 4, - RC_PROTO_JVC = 5, - RC_PROTO_SONY12 = 6, - RC_PROTO_SONY15 = 7, - RC_PROTO_SONY20 = 8, - RC_PROTO_NEC = 9, - RC_PROTO_NECX = 10, - RC_PROTO_NEC32 = 11, - RC_PROTO_SANYO = 12, - RC_PROTO_MCIR2_KBD = 13, - RC_PROTO_MCIR2_MSE = 14, - RC_PROTO_RC6_0 = 15, - RC_PROTO_RC6_6A_20 = 16, - RC_PROTO_RC6_6A_24 = 17, - RC_PROTO_RC6_6A_32 = 18, - RC_PROTO_RC6_MCE = 19, - RC_PROTO_SHARP = 20, - RC_PROTO_XMP = 21, - RC_PROTO_CEC = 22, - RC_PROTO_IMON = 23, - RC_PROTO_RCMM12 = 24, - RC_PROTO_RCMM24 = 25, - RC_PROTO_RCMM32 = 26, -}; - -#endif diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index ebf68dce5504..2893e9f2f1e0 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -28,7 +28,6 @@ // 5. We can read keycode from same /dev/lirc device #include -#include #include #include #include From 429c3be8a5e2695b5b92a6a12361eb89eb185495 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 25 Jan 2022 12:06:54 +0200 Subject: [PATCH 0597/1295] sch_htb: Fail on unsupported parameters when offload is requested The current implementation of HTB offload doesn't support some parameters. Instead of ignoring them, actively return the EINVAL error when they are set to non-defaults. As this patch goes to stable, the driver API is not changed here. If future drivers support more offload parameters, the checks can be moved to the driver side. Note that the buffer and cbuffer parameters are also not supported, but the tc userspace tool assigns some default values derived from rate and ceil, and identifying these defaults in sch_htb would be unreliable, so they are still ignored. Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload") Reported-by: Jakub Kicinski Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220125100654.424570-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9267922ea9c3..23a9d6242429 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1810,6 +1810,26 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + if (q->offload) { + /* Options not supported by the offload. */ + if (hopt->rate.overhead || hopt->ceil.overhead) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter"); + goto failure; + } + if (hopt->rate.mpu || hopt->ceil.mpu) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); + goto failure; + } + if (hopt->quantum) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); + goto failure; + } + if (hopt->prio) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter"); + goto failure; + } + } + /* Keeping backward compatible with rate_table based iproute2 tc */ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB], From 76cea3d95513fe40000d06a3719c4bb6b53275e2 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 20 Jan 2022 14:05:27 +1000 Subject: [PATCH 0598/1295] Revert "drm/ast: Support 1600x900 with 108MHz PCLK" This reverts commit 9bb7b689274b67ecb3641e399e76f84adc627df1. This caused a regression reported to Red Hat. Fixes: 9bb7b689274b ("drm/ast: Support 1600x900 with 108MHz PCLK") Signed-off-by: Dave Airlie Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20220120040527.552068-1-airlied@gmail.com --- drivers/gpu/drm/ast/ast_tables.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/ast/ast_tables.h b/drivers/gpu/drm/ast/ast_tables.h index d9eb353a4bf0..dbe1cc620f6e 100644 --- a/drivers/gpu/drm/ast/ast_tables.h +++ b/drivers/gpu/drm/ast/ast_tables.h @@ -282,8 +282,6 @@ static const struct ast_vbios_enhtable res_1360x768[] = { }; static const struct ast_vbios_enhtable res_1600x900[] = { - {1800, 1600, 24, 80, 1000, 900, 1, 3, VCLK108, /* 60Hz */ - (SyncPP | Charx8Dot | LineCompareOff | WideScreenMode | NewModeInfo), 60, 3, 0x3A }, {1760, 1600, 48, 32, 926, 900, 3, 5, VCLK97_75, /* 60Hz CVT RB */ (SyncNP | Charx8Dot | LineCompareOff | WideScreenMode | NewModeInfo | AST2500PreCatchCRT), 60, 1, 0x3A }, From c733ebb7cb67dfb146a07c0ae329a0de9ec52f36 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 24 Jan 2022 13:38:09 +0000 Subject: [PATCH 0599/1295] irqchip/gic-v3-its: Reset each ITS's BASERn register before probe A recent bug report outlined that the way GICv4.1 is handled across kexec is pretty bad. We can end-up in a situation where ITSs share memory (this is the case when SVPET==1) and reprogram the base registers, creating a situation where ITSs that are part of a given affinity group see different pointers. Which is illegal. Boo. In order to restore some sanity, reset the BASERn registers to 0 *before* probing any ITS. Although this isn't optimised at all, this is only a once-per-boot cost, which shouldn't show up on anyone's radar. Cc: Jay Chen Signed-off-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20211216190315.GA14220@lpieralisi Link: https://lore.kernel.org/r/20220124133809.1291195-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 120 +++++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 21 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 7b8f1ec0ff78..220fa45c22f4 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4856,6 +4856,38 @@ static struct syscore_ops its_syscore_ops = { .resume = its_restore_enable, }; +static void __init __iomem *its_map_one(struct resource *res, int *err) +{ + void __iomem *its_base; + u32 val; + + its_base = ioremap(res->start, SZ_64K); + if (!its_base) { + pr_warn("ITS@%pa: Unable to map ITS registers\n", &res->start); + *err = -ENOMEM; + return NULL; + } + + val = readl_relaxed(its_base + GITS_PIDR2) & GIC_PIDR2_ARCH_MASK; + if (val != 0x30 && val != 0x40) { + pr_warn("ITS@%pa: No ITS detected, giving up\n", &res->start); + *err = -ENODEV; + goto out_unmap; + } + + *err = its_force_quiescent(its_base); + if (*err) { + pr_warn("ITS@%pa: Failed to quiesce, giving up\n", &res->start); + goto out_unmap; + } + + return its_base; + +out_unmap: + iounmap(its_base); + return NULL; +} + static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) { struct irq_domain *inner_domain; @@ -4963,29 +4995,14 @@ static int __init its_probe_one(struct resource *res, { struct its_node *its; void __iomem *its_base; - u32 val, ctlr; u64 baser, tmp, typer; struct page *page; + u32 ctlr; int err; - its_base = ioremap(res->start, SZ_64K); - if (!its_base) { - pr_warn("ITS@%pa: Unable to map ITS registers\n", &res->start); - return -ENOMEM; - } - - val = readl_relaxed(its_base + GITS_PIDR2) & GIC_PIDR2_ARCH_MASK; - if (val != 0x30 && val != 0x40) { - pr_warn("ITS@%pa: No ITS detected, giving up\n", &res->start); - err = -ENODEV; - goto out_unmap; - } - - err = its_force_quiescent(its_base); - if (err) { - pr_warn("ITS@%pa: Failed to quiesce, giving up\n", &res->start); - goto out_unmap; - } + its_base = its_map_one(res, &err); + if (!its_base) + return err; pr_info("ITS %pR\n", res); @@ -5249,6 +5266,23 @@ out: return ret; } +/* Mark all the BASER registers as invalid before they get reprogrammed */ +static int __init its_reset_one(struct resource *res) +{ + void __iomem *its_base; + int err, i; + + its_base = its_map_one(res, &err); + if (!its_base) + return err; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) + gits_write_baser(0, its_base + GITS_BASER + (i << 3)); + + iounmap(its_base); + return 0; +} + static const struct of_device_id its_device_id[] = { { .compatible = "arm,gic-v3-its", }, {}, @@ -5259,6 +5293,26 @@ static int __init its_of_probe(struct device_node *node) struct device_node *np; struct resource res; + /* + * Make sure *all* the ITS are reset before we probe any, as + * they may be sharing memory. If any of the ITS fails to + * reset, don't even try to go any further, as this could + * result in something even worse. + */ + for (np = of_find_matching_node(node, its_device_id); np; + np = of_find_matching_node(np, its_device_id)) { + int err; + + if (!of_device_is_available(np) || + !of_property_read_bool(np, "msi-controller") || + of_address_to_resource(np, 0, &res)) + continue; + + err = its_reset_one(&res); + if (err) + return err; + } + for (np = of_find_matching_node(node, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { if (!of_device_is_available(np)) @@ -5421,11 +5475,35 @@ dom_err: return err; } +static int __init its_acpi_reset(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_generic_translator *its_entry; + struct resource res; + + its_entry = (struct acpi_madt_generic_translator *)header; + res = (struct resource) { + .start = its_entry->base_address, + .end = its_entry->base_address + ACPI_GICV3_ITS_MEM_SIZE - 1, + .flags = IORESOURCE_MEM, + }; + + return its_reset_one(&res); +} + static void __init its_acpi_probe(void) { acpi_table_parse_srat_its(); - acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, - gic_acpi_parse_madt_its, 0); + /* + * Make sure *all* the ITS are reset before we probe any, as + * they may be sharing memory. If any of the ITS fails to + * reset, don't even try to go any further, as this could + * result in something even worse. + */ + if (acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + its_acpi_reset, 0) > 0) + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + gic_acpi_parse_madt_its, 0); acpi_its_srat_maps_free(); } #else From 0c566618e27f17b5807086dba8c222ca8ca3dc1e Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Thu, 16 Dec 2021 08:12:27 -0800 Subject: [PATCH 0600/1295] arm64: dts: imx8mn-venice-gw7902: disable gpu Since commit 99aa29932271 ("arm64: dts: imx8mn: Enable GPU") imx8mn-venice-gw7902 will hang during kernel init because it uses a MIMX8MN5CVTI which does not have a GPU. Disable pgc_gpumix to work around this. We also disable the GPU devices that depend on the gpumix power domain and pgc_gpu to avoid them staying in a probe deferred state forever. Cc: Adam Ford Cc: Lucas Stach Signed-off-by: Tim Harvey Fixes: 99aa29932271 ("arm64: dts: imx8mn: Enable GPU") Reviewed-by: Adam Ford Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mn-venice-gw7902.dts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts index 236f425e1570..2d58005d20e4 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts @@ -220,6 +220,10 @@ }; }; +&disp_blk_ctrl { + status = "disabled"; +}; + /* off-board header */ &ecspi2 { pinctrl-names = "default"; @@ -251,6 +255,10 @@ }; }; +&gpu { + status = "disabled"; +}; + &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; @@ -546,6 +554,10 @@ status = "okay"; }; +&pgc_gpumix { + status = "disabled"; +}; + /* off-board header */ &sai3 { pinctrl-names = "default"; From 825911492eb15bf8bb7fb94bc0c0421fe7a6327d Mon Sep 17 00:00:00 2001 From: Sing-Han Chen Date: Wed, 12 Jan 2022 17:41:43 +0800 Subject: [PATCH 0601/1295] ucsi_ccg: Check DEV_INT bit only when starting CCG4 CCGx clears Bit 0:Device Interrupt in the INTR_REG if CCGx is reset successfully. However, there might be a chance that other bits in INTR_REG are not cleared due to internal data queued in PPM. This case misleads the driver that CCGx reset failed. The commit checks bit 0 in INTR_REG and ignores other bits. The ucsi driver would reset PPM later. Fixes: 247c554a14aa ("usb: typec: ucsi: add support for Cypress CCGx") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Signed-off-by: Sing-Han Chen Signed-off-by: Wayne Chang Link: https://lore.kernel.org/r/20220112094143.628610-1-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_ccg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index bff96d64dddf..6db7c8ddd51c 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -325,7 +325,7 @@ static int ucsi_ccg_init(struct ucsi_ccg *uc) if (status < 0) return status; - if (!data) + if (!(data & DEV_INT)) return 0; status = ccg_write(uc, CCGX_RAB_INTR_REG, &data, sizeof(data)); From 904edf8aeb459697129be5fde847e2a502f41fd9 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Sat, 22 Jan 2022 08:33:22 +0530 Subject: [PATCH 0602/1295] usb: gadget: f_sourcesink: Fix isoc transfer for USB_SPEED_SUPER_PLUS Currently when gadget enumerates in super speed plus, the isoc endpoint request buffer size is not calculated correctly. Fix this by checking the gadget speed against USB_SPEED_SUPER_PLUS and update the request buffer size. Fixes: 90c4d05780d4 ("usb: fix various gadgets null ptr deref on 10gbps cabling.") Cc: stable Signed-off-by: Pavankumar Kondeti Link: https://lore.kernel.org/r/1642820602-20619-1-git-send-email-quic_pkondeti@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_sourcesink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_sourcesink.c b/drivers/usb/gadget/function/f_sourcesink.c index 1abf08e5164a..6803cd60cc6d 100644 --- a/drivers/usb/gadget/function/f_sourcesink.c +++ b/drivers/usb/gadget/function/f_sourcesink.c @@ -584,6 +584,7 @@ static int source_sink_start_ep(struct f_sourcesink *ss, bool is_in, if (is_iso) { switch (speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: size = ss->isoc_maxpacket * (ss->isoc_mult + 1) * From 2e3dd4a6246945bf84ea6f478365d116e661554c Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Mon, 17 Jan 2022 15:00:39 +0000 Subject: [PATCH 0603/1295] usb: common: ulpi: Fix crash in ulpi_match() Commit 7495af930835 ("ARM: multi_v7_defconfig: Enable drivers for DragonBoard 410c") enables the CONFIG_PHY_QCOM_USB_HS for the ARM multi_v7_defconfig. Enabling this Kconfig is causing the kernel to crash on the Tegra20 Ventana platform in the ulpi_match() function. The Qualcomm USB HS PHY driver that is enabled by CONFIG_PHY_QCOM_USB_HS, registers a ulpi_driver but this driver does not provide an 'id_table', so when ulpi_match() is called on the Tegra20 Ventana platform, it crashes when attempting to deference the id_table pointer which is not valid. The Qualcomm USB HS PHY driver uses device-tree for matching the ULPI driver with the device and so fix this crash by using device-tree for matching if the id_table is not valid. Fixes: ef6a7bcfb01c ("usb: ulpi: Support device discovery via DT") Cc: stable Signed-off-by: Jon Hunter Link: https://lore.kernel.org/r/20220117150039.44058-1-jonathanh@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/ulpi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index 4169cf40a03b..8f8405b0d608 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -39,8 +39,11 @@ static int ulpi_match(struct device *dev, struct device_driver *driver) struct ulpi *ulpi = to_ulpi_dev(dev); const struct ulpi_device_id *id; - /* Some ULPI devices don't have a vendor id so rely on OF match */ - if (ulpi->id.vendor == 0) + /* + * Some ULPI devices don't have a vendor id + * or provide an id_table so rely on OF match. + */ + if (ulpi->id.vendor == 0 || !drv->id_table) return of_driver_match_device(dev, driver); for (id = drv->id_table; id->vendor; id++) From 9df478463d9feb90dae24f183383961cf123a0ec Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 10 Jan 2022 11:27:38 -0600 Subject: [PATCH 0604/1295] usb: xhci-plat: fix crash when suspend if remote wake enable Crashed at i.mx8qm platform when suspend if enable remote wakeup Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 244 Comm: kworker/u12:6 Not tainted 5.15.5-dirty #12 Hardware name: Freescale i.MX8QM MEK (DT) Workqueue: events_unbound async_run_entry_fn pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : xhci_disable_hub_port_wake.isra.62+0x60/0xf8 lr : xhci_disable_hub_port_wake.isra.62+0x34/0xf8 sp : ffff80001394bbf0 x29: ffff80001394bbf0 x28: 0000000000000000 x27: ffff00081193b578 x26: ffff00081193b570 x25: 0000000000000000 x24: 0000000000000000 x23: ffff00081193a29c x22: 0000000000020001 x21: 0000000000000001 x20: 0000000000000000 x19: ffff800014e90490 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000002 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000960 x9 : ffff80001394baa0 x8 : ffff0008145d1780 x7 : ffff0008f95b8e80 x6 : 000000001853b453 x5 : 0000000000000496 x4 : 0000000000000000 x3 : ffff00081193a29c x2 : 0000000000000001 x1 : 0000000000000000 x0 : ffff000814591620 Call trace: xhci_disable_hub_port_wake.isra.62+0x60/0xf8 xhci_suspend+0x58/0x510 xhci_plat_suspend+0x50/0x78 platform_pm_suspend+0x2c/0x78 dpm_run_callback.isra.25+0x50/0xe8 __device_suspend+0x108/0x3c0 The basic flow: 1. run time suspend call xhci_suspend, xhci parent devices gate the clock. 2. echo mem >/sys/power/state, system _device_suspend call xhci_suspend 3. xhci_suspend call xhci_disable_hub_port_wake, which access register, but clock already gated by run time suspend. This problem was hidden by power domain driver, which call run time resume before it. But the below commit remove it and make this issue happen. commit c1df456d0f06e ("PM: domains: Don't runtime resume devices at genpd_prepare()") This patch call run time resume before suspend to make sure clock is on before access register. Reviewed-by: Peter Chen Cc: stable Signed-off-by: Frank Li Testeb-by: Abel Vesa Link: https://lore.kernel.org/r/20220110172738.31686-1-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-plat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index c1edcc9b13ce..dc570ce4e831 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -437,6 +437,9 @@ static int __maybe_unused xhci_plat_suspend(struct device *dev) struct xhci_hcd *xhci = hcd_to_xhci(hcd); int ret; + if (pm_runtime_suspended(dev)) + pm_runtime_resume(dev); + ret = xhci_priv_suspend_quirk(hcd); if (ret) return ret; From 9678f3361afc27a3124cd2824aec0227739986fb Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 25 Jan 2022 18:02:50 -0600 Subject: [PATCH 0605/1295] usb: dwc3: xilinx: Skip resets and USB3 register settings for USB2.0 mode It appears that the PIPE clock should not be selected when only USB 2.0 is being used in the design and no USB 3.0 reference clock is used. Also, the core resets are not required if a USB3 PHY is not in use, and will break things if USB3 is actually used but the PHY entry is not listed in the device tree. Skip core resets and register settings that are only required for USB3 mode when no USB3 PHY is specified in the device tree. Fixes: 84770f028fab ("usb: dwc3: Add driver for Xilinx platforms") Cc: stable Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220126000253.1586760-2-robert.hancock@calian.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index 9cc3ad701a29..06b591b14b09 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -110,6 +110,18 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) usb3_phy = NULL; } + /* + * The following core resets are not required unless a USB3 PHY + * is used, and the subsequent register settings are not required + * unless a core reset is performed (they should be set properly + * by the first-stage boot loader, but may be reverted by a core + * reset). They may also break the configuration if USB3 is actually + * in use but the usb3-phy entry is missing from the device tree. + * Therefore, skip these operations in this case. + */ + if (!usb3_phy) + goto skip_usb3_phy; + crst = devm_reset_control_get_exclusive(dev, "usb_crst"); if (IS_ERR(crst)) { ret = PTR_ERR(crst); @@ -188,6 +200,7 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) goto err; } +skip_usb3_phy: /* * This routes the USB DMA traffic to go through FPD path instead * of reaching DDR directly. This traffic routing is needed to From 2cc9b1c93b1c4caa2d971856c0780fb5f7d04692 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 25 Jan 2022 18:02:51 -0600 Subject: [PATCH 0606/1295] usb: dwc3: xilinx: Fix error handling when getting USB3 PHY The code that looked up the USB3 PHY was ignoring all errors other than EPROBE_DEFER in an attempt to handle the PHY not being present. Fix and simplify the code by using devm_phy_optional_get and dev_err_probe so that a missing PHY is not treated as an error and unexpected errors are handled properly. Fixes: 84770f028fab ("usb: dwc3: Add driver for Xilinx platforms") Cc: stable Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220126000253.1586760-3-robert.hancock@calian.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index 06b591b14b09..e14ac15e24c3 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -102,12 +102,12 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) int ret; u32 reg; - usb3_phy = devm_phy_get(dev, "usb3-phy"); - if (PTR_ERR(usb3_phy) == -EPROBE_DEFER) { - ret = -EPROBE_DEFER; + usb3_phy = devm_phy_optional_get(dev, "usb3-phy"); + if (IS_ERR(usb3_phy)) { + ret = PTR_ERR(usb3_phy); + dev_err_probe(dev, ret, + "failed to get USB3 PHY\n"); goto err; - } else if (IS_ERR(usb3_phy)) { - usb3_phy = NULL; } /* From 26d81b29249273d39e753cc0c7b0ca62c6a6283f Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 19 Jan 2022 10:08:49 +0800 Subject: [PATCH 0607/1295] usb: gadget: at91_udc: fix incorrect print type Fix a build error observed with ARCH=arm DEFCONFIG=allmodconfig build. drivers/usb/gadget/udc/at91_udc.h:174:42: error: format '%d' expects argument of type 'int', but argument 3 has type 'struct gpio_desc *' [-Werror=format=] Fixes: 4a555f2b8d31 ("usb: gadget: at91_udc: Convert to GPIO descriptors") Reviewed-by: Macpaul Lin Signed-off-by: Miles Chen Link: https://lore.kernel.org/r/20220119020849.25732-1-miles.chen@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/at91_udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c index dd0819df096e..9040a0561466 100644 --- a/drivers/usb/gadget/udc/at91_udc.c +++ b/drivers/usb/gadget/udc/at91_udc.c @@ -1895,7 +1895,7 @@ static int at91udc_probe(struct platform_device *pdev) at91_vbus_irq, 0, driver_name, udc); if (retval) { DBG("request vbus irq %d failed\n", - udc->board.vbus_pin); + desc_to_gpio(udc->board.vbus_pin)); goto err_unprepare_iclk; } } From ac55d163855924aa5af9f1560977da8f346963c8 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Tue, 7 Dec 2021 14:01:01 +0100 Subject: [PATCH 0608/1295] usb: dwc2: gadget: don't try to disable ep0 in dwc2_hsotg_suspend Calling dwc2_hsotg_ep_disable on ep0 (in/out) will lead to the following logs before returning -EINVAL: dwc2 49000000.usb-otg: dwc2_hsotg_ep_disable: called for ep0 dwc2 49000000.usb-otg: dwc2_hsotg_ep_disable: called for ep0 To avoid these two logs while suspending, start disabling the endpoint from the index 1, as done in dwc2_hsotg_udc_stop: /* all endpoints should be shutdown */ for (ep = 1; ep < hsotg->num_of_eps; ep++) { if (hsotg->eps_in[ep]) dwc2_hsotg_ep_disable_lock(&hsotg->eps_in[ep]->ep); if (hsotg->eps_out[ep]) dwc2_hsotg_ep_disable_lock(&hsotg->eps_out[ep]->ep); } Acked-by: Minas Harutyunyan Signed-off-by: Amelie Delaunay Link: https://lore.kernel.org/r/20211207130101.270314-1-amelie.delaunay@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 2bc03f41c70a..eee3504397e6 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -5097,7 +5097,7 @@ int dwc2_hsotg_suspend(struct dwc2_hsotg *hsotg) hsotg->gadget.speed = USB_SPEED_UNKNOWN; spin_unlock_irqrestore(&hsotg->lock, flags); - for (ep = 0; ep < hsotg->num_of_eps; ep++) { + for (ep = 1; ep < hsotg->num_of_eps; ep++) { if (hsotg->eps_in[ep]) dwc2_hsotg_ep_disable_lock(&hsotg->eps_in[ep]->ep); if (hsotg->eps_out[ep]) From 90cafce461de108bfb07c06148395dc86c3fcd23 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Mon, 6 Dec 2021 18:19:31 +0800 Subject: [PATCH 0609/1295] spi: change clk_disable_unprepare to clk_unprepare The corresponding API for clk_prepare is clk_unprepare, other than clk_disable_unprepare. Fix this by changing clk_disable_unprepare to clk_unprepare. Fixes: 5762ab71eb24 ("spi: Add support for Armada 3700 SPI Controller") Signed-off-by: Dongliang Mu Link: https://lore.kernel.org/r/20211206101931.2816597-1-mudongliangabcd@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-armada-3700.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c index 46feafe4e201..d8cc4b270644 100644 --- a/drivers/spi/spi-armada-3700.c +++ b/drivers/spi/spi-armada-3700.c @@ -901,7 +901,7 @@ static int a3700_spi_probe(struct platform_device *pdev) return 0; error_clk: - clk_disable_unprepare(spi->clk); + clk_unprepare(spi->clk); error: spi_master_put(master); out: From 23e3404de1aecc62c14ac96d4b63403c3e0f52d5 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 22 Dec 2021 13:48:12 +0900 Subject: [PATCH 0610/1295] spi: uniphier: Fix a bug that doesn't point to private data correctly In uniphier_spi_remove(), there is a wrong code to get private data from the platform device, so the driver can't be removed properly. The driver should get spi_master from the platform device and retrieve the private data from it. Cc: Fixes: 5ba155a4d4cc ("spi: add SPI controller driver for UniPhier SoC") Signed-off-by: Kunihiko Hayashi Link: https://lore.kernel.org/r/1640148492-32178-1-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Mark Brown --- drivers/spi/spi-uniphier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index 8900e51e1a1c..342ee8d2c476 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -767,12 +767,13 @@ out_master_put: static int uniphier_spi_remove(struct platform_device *pdev) { - struct uniphier_spi_priv *priv = platform_get_drvdata(pdev); + struct spi_master *master = platform_get_drvdata(pdev); + struct uniphier_spi_priv *priv = spi_master_get_devdata(master); - if (priv->master->dma_tx) - dma_release_channel(priv->master->dma_tx); - if (priv->master->dma_rx) - dma_release_channel(priv->master->dma_rx); + if (master->dma_tx) + dma_release_channel(master->dma_tx); + if (master->dma_rx) + dma_release_channel(master->dma_rx); clk_disable_unprepare(priv->clk); From 79aa3e19fe8f5be30e846df8a436bfe306e8b1a6 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Tue, 11 Jan 2022 10:07:37 +0100 Subject: [PATCH 0611/1295] usb: cdnsp: Fix segmentation fault in cdns_lost_power function CDNSP driver read not initialized cdns->otg_v0_regs which lead to segmentation fault. Patch fixes this issue. Fixes: 2cf2581cd229 ("usb: cdns3: add power lost support for system resume") cc: Signed-off-by: Pawel Laszczak Link: https://lore.kernel.org/r/20220111090737.10345-1-pawell@gli-login.cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/drd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/cdns3/drd.c b/drivers/usb/cdns3/drd.c index 55c73b1d8704..d00ff98dffab 100644 --- a/drivers/usb/cdns3/drd.c +++ b/drivers/usb/cdns3/drd.c @@ -483,11 +483,11 @@ int cdns_drd_exit(struct cdns *cdns) /* Indicate the cdns3 core was power lost before */ bool cdns_power_is_lost(struct cdns *cdns) { - if (cdns->version == CDNS3_CONTROLLER_V1) { - if (!(readl(&cdns->otg_v1_regs->simulate) & BIT(0))) + if (cdns->version == CDNS3_CONTROLLER_V0) { + if (!(readl(&cdns->otg_v0_regs->simulate) & BIT(0))) return true; } else { - if (!(readl(&cdns->otg_v0_regs->simulate) & BIT(0))) + if (!(readl(&cdns->otg_v1_regs->simulate) & BIT(0))) return true; } return false; From 993d66140f8d1c1853a3b58b77b43b681eb64dee Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 19 Dec 2021 19:42:15 -0300 Subject: [PATCH 0612/1295] ARM: dts: imx6qdl-udoo: Properly describe the SD card detect GPIO7_IO00 is used as SD card detect. Properly describe this in the devicetree. Fixes: 40cdaa542cf0 ("ARM: dts: imx6q-udoo: Add initial board support") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6qdl-udoo.dtsi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/imx6qdl-udoo.dtsi b/arch/arm/boot/dts/imx6qdl-udoo.dtsi index d07d8f83456d..ccfa8e320be6 100644 --- a/arch/arm/boot/dts/imx6qdl-udoo.dtsi +++ b/arch/arm/boot/dts/imx6qdl-udoo.dtsi @@ -5,6 +5,8 @@ * Author: Fabio Estevam */ +#include + / { aliases { backlight = &backlight; @@ -226,6 +228,7 @@ MX6QDL_PAD_SD3_DAT1__SD3_DATA1 0x17059 MX6QDL_PAD_SD3_DAT2__SD3_DATA2 0x17059 MX6QDL_PAD_SD3_DAT3__SD3_DATA3 0x17059 + MX6QDL_PAD_SD3_DAT5__GPIO7_IO00 0x1b0b0 >; }; @@ -304,7 +307,7 @@ &usdhc3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc3>; - non-removable; + cd-gpios = <&gpio7 0 GPIO_ACTIVE_LOW>; status = "okay"; }; From c5c1546a654f613e291a7c5d6f3660fc1eb6d0c7 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:46 +0000 Subject: [PATCH 0613/1295] ASoC: codecs: wcd938x: fix incorrect used of portid Mixer controls have the channel id in mixer->reg, which is not same as port id. port id should be derived from chan_info array. So fix this. Without this, its possible that we could corrupt struct wcd938x_sdw_priv by accessing port_map array out of range with channel id instead of port id. Fixes: e8ba1e05bdc0 ("ASoC: codecs: wcd938x: add basic controls") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-2-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index eff200a07d9f..5994644c8702 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -1432,14 +1432,10 @@ static int wcd938x_sdw_connect_port(struct wcd938x_sdw_ch_info *ch_info, return 0; } -static int wcd938x_connect_port(struct wcd938x_sdw_priv *wcd, u8 ch_id, u8 enable) +static int wcd938x_connect_port(struct wcd938x_sdw_priv *wcd, u8 port_num, u8 ch_id, u8 enable) { - u8 port_num; - - port_num = wcd->ch_info[ch_id].port_num; - return wcd938x_sdw_connect_port(&wcd->ch_info[ch_id], - &wcd->port_config[port_num], + &wcd->port_config[port_num - 1], enable); } @@ -2593,6 +2589,7 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, struct wcd938x_priv *wcd938x = snd_soc_component_get_drvdata(component); struct wcd938x_sdw_priv *wcd; int value = ucontrol->value.integer.value[0]; + int portidx; struct soc_mixer_control *mc; bool hphr; @@ -2606,10 +2603,12 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, else wcd938x->comp1_enable = value; + portidx = wcd->ch_info[mc->reg].port_num; + if (value) - wcd938x_connect_port(wcd, mc->reg, true); + wcd938x_connect_port(wcd, portidx, mc->reg, true); else - wcd938x_connect_port(wcd, mc->reg, false); + wcd938x_connect_port(wcd, portidx, mc->reg, false); return 0; } @@ -2882,9 +2881,11 @@ static int wcd938x_get_swr_port(struct snd_kcontrol *kcontrol, struct wcd938x_sdw_priv *wcd; struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; int dai_id = mixer->shift; - int portidx = mixer->reg; + int portidx, ch_idx = mixer->reg; + wcd = wcd938x->sdw_priv[dai_id]; + portidx = wcd->ch_info[ch_idx].port_num; ucontrol->value.integer.value[0] = wcd->port_enable[portidx]; @@ -2899,12 +2900,14 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, struct wcd938x_sdw_priv *wcd; struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; - int portidx = mixer->reg; + int ch_idx = mixer->reg; + int portidx; int dai_id = mixer->shift; bool enable; wcd = wcd938x->sdw_priv[dai_id]; + portidx = wcd->ch_info[ch_idx].port_num; if (ucontrol->value.integer.value[0]) enable = true; else @@ -2912,7 +2915,7 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, wcd->port_enable[portidx] = enable; - wcd938x_connect_port(wcd, portidx, enable); + wcd938x_connect_port(wcd, portidx, ch_idx, enable); return 0; From fca041a3ab70a099a6d5519ecb689b6279bd04f3 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:47 +0000 Subject: [PATCH 0614/1295] ASoC: codecs: lpass-rx-macro: fix sidetone register offsets For some reason we ended up with incorrect register offfset calcuations for sidetone. regmap clearly throw errors when accessing these incorrect registers as these do not belong to any read/write ranges. so fix them to point to correct register offsets. Fixes: f3ce6f3c9a99 ("ASoC: codecs: lpass-rx-macro: add iir widgets") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-3-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-rx-macro.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/lpass-rx-macro.c b/sound/soc/codecs/lpass-rx-macro.c index aec5127260fd..6ffe88345de5 100644 --- a/sound/soc/codecs/lpass-rx-macro.c +++ b/sound/soc/codecs/lpass-rx-macro.c @@ -2688,8 +2688,8 @@ static uint32_t get_iir_band_coeff(struct snd_soc_component *component, int reg, b2_reg; /* Address does not automatically update if reading */ - reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 16 * iir_idx; - b2_reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 16 * iir_idx; + reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 0x80 * iir_idx; + b2_reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 0x80 * iir_idx; snd_soc_component_write(component, reg, ((band_idx * BAND_MAX + coeff_idx) * @@ -2718,7 +2718,7 @@ static uint32_t get_iir_band_coeff(struct snd_soc_component *component, static void set_iir_band_coeff(struct snd_soc_component *component, int iir_idx, int band_idx, uint32_t value) { - int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 16 * iir_idx; + int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B2_CTL + 0x80 * iir_idx; snd_soc_component_write(component, reg, (value & 0xFF)); snd_soc_component_write(component, reg, (value >> 8) & 0xFF); @@ -2739,7 +2739,7 @@ static int rx_macro_put_iir_band_audio_mixer( int iir_idx = ctl->iir_idx; int band_idx = ctl->band_idx; u32 coeff[BAND_MAX]; - int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 16 * iir_idx; + int reg = CDC_RX_SIDETONE_IIR0_IIR_COEF_B1_CTL + 0x80 * iir_idx; memcpy(&coeff[0], ucontrol->value.bytes.data, params->max); From bd2347fd67d8da0fa76296507cc556da0a233bcb Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:48 +0000 Subject: [PATCH 0615/1295] ASoC: codecs: wcd938x: fix return value of mixer put function wcd938x_ear_pa_put_gain, wcd938x_set_swr_port and wcd938x_set_compander currently returns zero eventhough it changes the value. Fix this, so that change notifications are sent correctly. Fixes: e8ba1e05bdc01 ("ASoC: codecs: wcd938x: add basic controls") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-4-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd938x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index 5994644c8702..36cbc66914f9 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -2559,7 +2559,7 @@ static int wcd938x_ear_pa_put_gain(struct snd_kcontrol *kcontrol, WCD938X_EAR_GAIN_MASK, ucontrol->value.integer.value[0]); - return 0; + return 1; } static int wcd938x_get_compander(struct snd_kcontrol *kcontrol, @@ -2610,7 +2610,7 @@ static int wcd938x_set_compander(struct snd_kcontrol *kcontrol, else wcd938x_connect_port(wcd, portidx, mc->reg, false); - return 0; + return 1; } static int wcd938x_ldoh_get(struct snd_kcontrol *kcontrol, @@ -2917,7 +2917,7 @@ static int wcd938x_set_swr_port(struct snd_kcontrol *kcontrol, wcd938x_connect_port(wcd, portidx, ch_idx, enable); - return 0; + return 1; } From 8f2e5c65ec7534cce6d315fccf2c3aef023f68f0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jan 2022 11:35:49 +0000 Subject: [PATCH 0616/1295] ASoC: qdsp6: q6apm-dai: only stop graphs that are started Its possible that the sound card is just opened and closed without actually playing stream, ex: if the audio file itself is missing. Even in such cases we do call stop on graphs that are not yet started. DSP can throw errors in such cases, so add a check to see if the graph was started before stopping it. Fixes: 9b4fe0f1cd79 ("ASoC: qdsp6: audioreach: add q6apm-dai support") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220126113549.8853-5-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-dai.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index eb1c3aec479b..19c4a90ec1ea 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -308,8 +308,11 @@ static int q6apm_dai_close(struct snd_soc_component *component, struct snd_pcm_runtime *runtime = substream->runtime; struct q6apm_dai_rtd *prtd = runtime->private_data; - q6apm_graph_stop(prtd->graph); - q6apm_unmap_memory_regions(prtd->graph, substream->stream); + if (prtd->state) { /* only stop graph that is started */ + q6apm_graph_stop(prtd->graph); + q6apm_unmap_memory_regions(prtd->graph, substream->stream); + } + q6apm_graph_close(prtd->graph); prtd->graph = NULL; kfree(prtd); From e937440f7fc444a3e3f1fb75ea65292d6f433a44 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 26 Jan 2022 11:04:47 +0000 Subject: [PATCH 0617/1295] spi: meson-spicc: add IRQ check in meson_spicc_probe This check misses checking for platform_get_irq()'s call and may passes the negative error codes to devm_request_irq(), which takes unsigned IRQ #, causing it to fail with -EINVAL, overriding an original error code. Stop calling devm_request_irq() with invalid IRQ #s. Fixes: 454fa271bc4e ("spi: Add Meson SPICC driver") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220126110447.24549-1-linmq006@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-meson-spicc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/spi/spi-meson-spicc.c b/drivers/spi/spi-meson-spicc.c index c208efeadd18..0bc7daa7afc8 100644 --- a/drivers/spi/spi-meson-spicc.c +++ b/drivers/spi/spi-meson-spicc.c @@ -693,6 +693,11 @@ static int meson_spicc_probe(struct platform_device *pdev) writel_relaxed(0, spicc->base + SPICC_INTREG); irq = platform_get_irq(pdev, 0); + if (irq < 0) { + ret = irq; + goto out_master; + } + ret = devm_request_irq(&pdev->dev, irq, meson_spicc_irq, 0, NULL, spicc); if (ret) { From 6a7b9f002eca6788d346c16a6ff0c218b41f8d1d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 26 Jan 2022 14:33:58 +0100 Subject: [PATCH 0618/1295] Revert "tty: serial: Use fifo in 8250 console driver" This reverts commit 5021d709b31b8a14317998a33cbc78be0de9ab30. The patch is still a bit buggy, and this breaks some other hardware types. It needs to be resubmitted in a non-buggy way, and make sure the other hardware types also continue to work properly. Fixes: 5021d709b31b ("tty: serial: Use fifo in 8250 console driver") Reported-by: Sebastian Andrzej Siewior Reported-by: Jon Hunter Link: https://lore.kernel.org/r/Ye/1+Z8mEzbKbrqG@linutronix.de Link: https://lore.kernel.org/r/a1ac6254-f79e-d131-fa2a-c7ad714c6d4a@nvidia.com Cc: Wander Lairson Costa Cc: Andy Shevchenko Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 61 +++-------------------------- 1 file changed, 6 insertions(+), 55 deletions(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 2abb3de11a48..3b12bfc1ed67 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2056,7 +2056,10 @@ static void serial8250_break_ctl(struct uart_port *port, int break_state) serial8250_rpm_put(up); } -static void wait_for_lsr(struct uart_8250_port *up, int bits) +/* + * Wait for transmitter & holding register to empty + */ +static void wait_for_xmitr(struct uart_8250_port *up, int bits) { unsigned int status, tmout = 10000; @@ -2073,16 +2076,6 @@ static void wait_for_lsr(struct uart_8250_port *up, int bits) udelay(1); touch_nmi_watchdog(); } -} - -/* - * Wait for transmitter & holding register to empty - */ -static void wait_for_xmitr(struct uart_8250_port *up, int bits) -{ - unsigned int tmout; - - wait_for_lsr(up, bits); /* Wait up to 1s for flow control if necessary */ if (up->port.flags & UPF_CONS_FLOW) { @@ -3332,35 +3325,6 @@ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } -/* - * Print a string to the serial port using the device FIFO - * - * It sends fifosize bytes and then waits for the fifo - * to get empty. - */ -static void serial8250_console_fifo_write(struct uart_8250_port *up, - const char *s, unsigned int count) -{ - int i; - const char *end = s + count; - unsigned int fifosize = up->port.fifosize; - bool cr_sent = false; - - while (s != end) { - wait_for_lsr(up, UART_LSR_THRE); - - for (i = 0; i < fifosize && s != end; ++i) { - if (*s == '\n' && !cr_sent) { - serial_out(up, UART_TX, '\r'); - cr_sent = true; - } else { - serial_out(up, UART_TX, *s++); - cr_sent = false; - } - } - } -} - /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @@ -3376,7 +3340,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, struct uart_8250_em485 *em485 = up->em485; struct uart_port *port = &up->port; unsigned long flags; - unsigned int ier, use_fifo; + unsigned int ier; int locked = 1; touch_nmi_watchdog(); @@ -3408,20 +3372,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, mdelay(port->rs485.delay_rts_before_send); } - use_fifo = (up->capabilities & UART_CAP_FIFO) && - port->fifosize > 1 && - (serial_port_in(port, UART_FCR) & UART_FCR_ENABLE_FIFO) && - /* - * After we put a data in the fifo, the controller will send - * it regardless of the CTS state. Therefore, only use fifo - * if we don't use control flow. - */ - !(up->port.flags & UPF_CONS_FLOW); - - if (likely(use_fifo)) - serial8250_console_fifo_write(up, s, count); - else - uart_console_write(port, s, count, serial8250_console_putchar); + uart_console_write(port, s, count, serial8250_console_putchar); /* * Finally, wait for transmitter to become empty From 592ee1197f78b30bd60c87db9b6c8c045c8d8314 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 26 Jan 2022 09:21:32 +0800 Subject: [PATCH 0619/1295] blk-mq: fix missing blk_account_io_done() in error path If blk_mq_request_issue_directly() failed from blk_insert_cloned_request(), the request will be accounted start. Currently, blk_insert_cloned_request() is only called by dm, and such request won't be accounted done by dm. In normal path, io will be accounted start from blk_mq_bio_to_request(), when the request is allocated, and such io will be accounted done from __blk_mq_end_request_acct() whether it succeeded or failed. Thus add blk_account_io_done() to fix the problem. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220126012132.3111551-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index f3bf3358a3bb..1adfe4824ef5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2922,6 +2922,8 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * */ blk_mq_run_dispatch_ops(rq->q, ret = blk_mq_request_issue_directly(rq, true)); + if (ret) + blk_account_io_done(rq, ktime_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); From 152d1afa834c84530828ee031cf07a00e0fc0b8c Mon Sep 17 00:00:00 2001 From: Cameron Williams Date: Mon, 24 Jan 2022 09:42:23 +0000 Subject: [PATCH 0620/1295] tty: Add support for Brainboxes UC cards. This commit adds support for the some of the Brainboxes PCI range of cards, including the UC-101, UC-235/246, UC-257, UC-268, UC-275/279, UC-302, UC-310, UC-313, UC-320/324, UC-346, UC-357, UC-368 and UC-420/431. Signed-off-by: Cameron Williams Cc: stable Link: https://lore.kernel.org/r/AM5PR0202MB2564688493F7DD9B9C610827C45E9@AM5PR0202MB2564.eurprd02.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 100 ++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index e8b5469e9dfa..e17e97ea86fa 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -4779,8 +4779,30 @@ static const struct pci_device_id serial_pci_tbl[] = { { PCI_VENDOR_ID_INTASHIELD, PCI_DEVICE_ID_INTASHIELD_IS400, PCI_ANY_ID, PCI_ANY_ID, 0, 0, /* 135a.0dc0 */ pbn_b2_4_115200 }, + /* Brainboxes Devices */ /* - * BrainBoxes UC-260 + * Brainboxes UC-101 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0BA1, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-235/246 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0AA1, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_1_115200 }, + /* + * Brainboxes UC-257 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0861, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-260/271/701/756 */ { PCI_VENDOR_ID_INTASHIELD, 0x0D21, PCI_ANY_ID, PCI_ANY_ID, @@ -4788,7 +4810,81 @@ static const struct pci_device_id serial_pci_tbl[] = { pbn_b2_4_115200 }, { PCI_VENDOR_ID_INTASHIELD, 0x0E34, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_COMMUNICATION_MULTISERIAL << 8, 0xffff00, + PCI_CLASS_COMMUNICATION_MULTISERIAL << 8, 0xffff00, + pbn_b2_4_115200 }, + /* + * Brainboxes UC-268 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0841, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_4_115200 }, + /* + * Brainboxes UC-275/279 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0881, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_8_115200 }, + /* + * Brainboxes UC-302 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x08E1, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-310 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x08C1, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-313 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x08A3, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-320/324 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0A61, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_1_115200 }, + /* + * Brainboxes UC-346 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0B02, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_4_115200 }, + /* + * Brainboxes UC-357 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0A81, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0A83, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_2_115200 }, + /* + * Brainboxes UC-368 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0C41, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_4_115200 }, + /* + * Brainboxes UC-420/431 + */ + { PCI_VENDOR_ID_INTASHIELD, 0x0921, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, pbn_b2_4_115200 }, /* * Perle PCI-RAS cards From f23653fe64479d96910bfda2b700b1af17c991ac Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 26 Jan 2022 09:22:54 +0000 Subject: [PATCH 0621/1295] tty: Partially revert the removal of the Cyclades public API Fix a user API regression introduced with commit f76edd8f7ce0 ("tty: cyclades, remove this orphan"), which removed a part of the API and caused compilation errors for user programs using said part, such as GCC 9 in its libsanitizer component[1]: .../libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc:160:10: fatal error: linux/cyclades.h: No such file or directory 160 | #include | ^~~~~~~~~~~~~~~~~~ compilation terminated. make[4]: *** [Makefile:664: sanitizer_platform_limits_posix.lo] Error 1 As the absolute minimum required bring `struct cyclades_monitor' and ioctl numbers back then so as to make the library build again. Add a preprocessor warning as to the obsolescence of the features provided. References: [1] GCC PR sanitizer/100379, "cyclades.h is removed from linux kernel header files", Fixes: f76edd8f7ce0 ("tty: cyclades, remove this orphan") Cc: stable@vger.kernel.org # v5.13+ Reviewed-by: Christoph Hellwig Signed-off-by: Maciej W. Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.20.2201260733430.11348@tpp.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/cyclades.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/uapi/linux/cyclades.h diff --git a/include/uapi/linux/cyclades.h b/include/uapi/linux/cyclades.h new file mode 100644 index 000000000000..6225c5aebe06 --- /dev/null +++ b/include/uapi/linux/cyclades.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_CYCLADES_H +#define _UAPI_LINUX_CYCLADES_H + +#warning "Support for features provided by this header has been removed" +#warning "Please consider updating your code" + +struct cyclades_monitor { + unsigned long int_count; + unsigned long char_count; + unsigned long char_max; + unsigned long char_last; +}; + +#define CYGETMON 0x435901 +#define CYGETTHRESH 0x435902 +#define CYSETTHRESH 0x435903 +#define CYGETDEFTHRESH 0x435904 +#define CYSETDEFTHRESH 0x435905 +#define CYGETTIMEOUT 0x435906 +#define CYSETTIMEOUT 0x435907 +#define CYGETDEFTIMEOUT 0x435908 +#define CYSETDEFTIMEOUT 0x435909 +#define CYSETRFLOW 0x43590a +#define CYGETRFLOW 0x43590b +#define CYSETRTSDTR_INV 0x43590c +#define CYGETRTSDTR_INV 0x43590d +#define CYZSETPOLLCYCLE 0x43590e +#define CYZGETPOLLCYCLE 0x43590f +#define CYGETCD1400VER 0x435910 +#define CYSETWAIT 0x435912 +#define CYGETWAIT 0x435913 + +#endif /* _UAPI_LINUX_CYCLADES_H */ From db7f19c0aa0abcb751ff0ed694a071363f702b1d Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Tue, 4 Jan 2022 17:35:45 +0100 Subject: [PATCH 0622/1295] tty: rpmsg: Fix race condition releasing tty port The tty_port struct is part of the rpmsg_tty_port structure. The issue is that the rpmsg_tty_port structure is freed on rpmsg_tty_remove while it is still referenced in the tty_struct. Its release is not predictable due to workqueues. For instance following ftrace shows that rpmsg_tty_close is called after rpmsg_tty_release_cport: nr_test.sh-389 [000] ..... 212.093752: rpmsg_tty_remove <-rpmsg_dev_ remove cat-1191 [001] ..... 212.095697: tty_release <-__fput nr_test.sh-389 [000] ..... 212.099166: rpmsg_tty_release_cport <-rpm sg_tty_remove cat-1191 [001] ..... 212.115352: rpmsg_tty_close <-tty_release cat-1191 [001] ..... 212.115371: release_tty <-tty_release_str As consequence, the port must be free only when user has released the TTY interface. This path : - Introduce the .destruct port tty ops function to release the allocated rpmsg_tty_port structure. - Introduce the .hangup tty ops function to call tty_port_hangup. - Manages the tty port refcounting to trig the .destruct port ops, - Introduces the rpmsg_tty_cleanup function to ensure that the TTY is removed before decreasing the port refcount. Fixes: 7c0408d80579 ("tty: add rpmsg driver") Cc: stable Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20220104163545.34710-1-arnaud.pouliquen@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/rpmsg_tty.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/tty/rpmsg_tty.c b/drivers/tty/rpmsg_tty.c index dae2a4e44f38..29db413bbc03 100644 --- a/drivers/tty/rpmsg_tty.c +++ b/drivers/tty/rpmsg_tty.c @@ -50,10 +50,17 @@ static int rpmsg_tty_cb(struct rpmsg_device *rpdev, void *data, int len, void *p static int rpmsg_tty_install(struct tty_driver *driver, struct tty_struct *tty) { struct rpmsg_tty_port *cport = idr_find(&tty_idr, tty->index); + struct tty_port *port; tty->driver_data = cport; - return tty_port_install(&cport->port, driver, tty); + port = tty_port_get(&cport->port); + return tty_port_install(port, driver, tty); +} + +static void rpmsg_tty_cleanup(struct tty_struct *tty) +{ + tty_port_put(tty->port); } static int rpmsg_tty_open(struct tty_struct *tty, struct file *filp) @@ -106,12 +113,19 @@ static unsigned int rpmsg_tty_write_room(struct tty_struct *tty) return size; } +static void rpmsg_tty_hangup(struct tty_struct *tty) +{ + tty_port_hangup(tty->port); +} + static const struct tty_operations rpmsg_tty_ops = { .install = rpmsg_tty_install, .open = rpmsg_tty_open, .close = rpmsg_tty_close, .write = rpmsg_tty_write, .write_room = rpmsg_tty_write_room, + .hangup = rpmsg_tty_hangup, + .cleanup = rpmsg_tty_cleanup, }; static struct rpmsg_tty_port *rpmsg_tty_alloc_cport(void) @@ -137,8 +151,10 @@ static struct rpmsg_tty_port *rpmsg_tty_alloc_cport(void) return cport; } -static void rpmsg_tty_release_cport(struct rpmsg_tty_port *cport) +static void rpmsg_tty_destruct_port(struct tty_port *port) { + struct rpmsg_tty_port *cport = container_of(port, struct rpmsg_tty_port, port); + mutex_lock(&idr_lock); idr_remove(&tty_idr, cport->id); mutex_unlock(&idr_lock); @@ -146,7 +162,10 @@ static void rpmsg_tty_release_cport(struct rpmsg_tty_port *cport) kfree(cport); } -static const struct tty_port_operations rpmsg_tty_port_ops = { }; +static const struct tty_port_operations rpmsg_tty_port_ops = { + .destruct = rpmsg_tty_destruct_port, +}; + static int rpmsg_tty_probe(struct rpmsg_device *rpdev) { @@ -166,7 +185,8 @@ static int rpmsg_tty_probe(struct rpmsg_device *rpdev) cport->id, dev); if (IS_ERR(tty_dev)) { ret = dev_err_probe(dev, PTR_ERR(tty_dev), "Failed to register tty port\n"); - goto err_destroy; + tty_port_put(&cport->port); + return ret; } cport->rpdev = rpdev; @@ -177,12 +197,6 @@ static int rpmsg_tty_probe(struct rpmsg_device *rpdev) rpdev->src, rpdev->dst, cport->id); return 0; - -err_destroy: - tty_port_destroy(&cport->port); - rpmsg_tty_release_cport(cport); - - return ret; } static void rpmsg_tty_remove(struct rpmsg_device *rpdev) @@ -192,13 +206,11 @@ static void rpmsg_tty_remove(struct rpmsg_device *rpdev) dev_dbg(&rpdev->dev, "Removing rpmsg tty device %d\n", cport->id); /* User hang up to release the tty */ - if (tty_port_initialized(&cport->port)) - tty_port_tty_hangup(&cport->port, false); + tty_port_tty_hangup(&cport->port, false); tty_unregister_device(rpmsg_tty_driver, cport->id); - tty_port_destroy(&cport->port); - rpmsg_tty_release_cport(cport); + tty_port_put(&cport->port); } static struct rpmsg_device_id rpmsg_driver_tty_id_table[] = { From d06b1cf28297e27127d3da54753a3a01a2fa2f28 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Wed, 12 Jan 2022 13:42:14 -0600 Subject: [PATCH 0623/1295] serial: 8250: of: Fix mapped region size when using reg-offset property 8250_of supports a reg-offset property which is intended to handle cases where the device registers start at an offset inside the region of memory allocated to the device. The Xilinx 16550 UART, for which this support was initially added, requires this. However, the code did not adjust the overall size of the mapped region accordingly, causing the driver to request an area of memory past the end of the device's allocation. For example, if the UART was allocated an address of 0xb0130000, size of 0x10000 and reg-offset of 0x1000 in the device tree, the region of memory reserved was b0131000-b0140fff, which caused the driver for the region starting at b0140000 to fail to probe. Fix this by subtracting reg-offset from the mapped region size. Fixes: b912b5e2cfb3 ([POWERPC] Xilinx: of_serial support for Xilinx uart 16550.) Cc: stable Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220112194214.881844-1-robert.hancock@calian.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_of.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index bce28729dd7b..be8626234627 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -83,8 +83,17 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->mapsize = resource_size(&resource); /* Check for shifted address mapping */ - if (of_property_read_u32(np, "reg-offset", &prop) == 0) + if (of_property_read_u32(np, "reg-offset", &prop) == 0) { + if (prop >= port->mapsize) { + dev_warn(&ofdev->dev, "reg-offset %u exceeds region size %pa\n", + prop, &port->mapsize); + ret = -EINVAL; + goto err_unprepare; + } + port->mapbase += prop; + port->mapsize -= prop; + } port->iotype = UPIO_MEM; if (of_property_read_u32(np, "reg-io-width", &prop) == 0) { From 8838b2af23caf1ff0610caef2795d6668a013b2d Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 20 Jan 2022 02:18:57 -0800 Subject: [PATCH 0624/1295] tty: n_gsm: fix SW flow control encoding/handling n_gsm is based on the 3GPP 07.010 and its newer version is the 3GPP 27.010. See https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1516 The changes from 07.010 to 27.010 are non-functional. Therefore, I refer to the newer 27.010 here. Chapter 5.2.7.3 states that DC1 (XON) and DC3 (XOFF) are the control characters defined in ISO/IEC 646. These shall be quoted if seen in the data stream to avoid interpretation as flow control characters. ISO/IEC 646 refers to the set of ISO standards described as the ISO 7-bit coded character set for information interchange. Its final version is also known as ITU T.50. See https://www.itu.int/rec/T-REC-T.50-199209-I/en To abide the standard it is needed to quote DC1 and DC3 correctly if these are seen as data bytes and not as control characters. The current implementation already tries to enforce this but fails to catch all defined cases. 3GPP 27.010 chapter 5.2.7.3 clearly states that the most significant bit shall be ignored for DC1 and DC3 handling. The current implementation handles only the case with the most significant bit set 0. Cases in which DC1 and DC3 have the most significant bit set 1 are left unhandled. This patch fixes this by masking the data bytes with ISO_IEC_646_MASK (only the 7 least significant bits set 1) before comparing them with XON (a.k.a. DC1) and XOFF (a.k.a. DC3) when testing which byte values need quotation via byte stuffing. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220120101857.2509-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index ba27b274c967..0b1808e3a912 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -322,6 +322,7 @@ static int addr_cnt; #define GSM1_ESCAPE_BITS 0x20 #define XON 0x11 #define XOFF 0x13 +#define ISO_IEC_646_MASK 0x7F static const struct tty_port_operations gsm_port_ops; @@ -531,7 +532,8 @@ static int gsm_stuff_frame(const u8 *input, u8 *output, int len) int olen = 0; while (len--) { if (*input == GSM1_SOF || *input == GSM1_ESCAPE - || *input == XON || *input == XOFF) { + || (*input & ISO_IEC_646_MASK) == XON + || (*input & ISO_IEC_646_MASK) == XOFF) { *output++ = GSM1_ESCAPE; *output++ = *input++ ^ GSM1_ESCAPE_BITS; olen++; From d3d079bde07e1b7deaeb57506dc0b86010121d17 Mon Sep 17 00:00:00 2001 From: Valentin Caron Date: Tue, 11 Jan 2022 17:44:40 +0100 Subject: [PATCH 0625/1295] serial: stm32: prevent TDR register overwrite when sending x_char When sending x_char in stm32_usart_transmit_chars(), driver can overwrite the value of TDR register by the value of x_char. If this happens, the previous value that was present in TDR register will not be sent through uart. This code checks if the previous value in TDR register is sent before writing the x_char value into register. Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver") Cc: stable Signed-off-by: Valentin Caron Link: https://lore.kernel.org/r/20220111164441.6178-2-valentin.caron@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 1f89ab0e49ac..c1b8828451c8 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -550,11 +550,23 @@ static void stm32_usart_transmit_chars(struct uart_port *port) struct stm32_port *stm32_port = to_stm32_port(port); const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs; struct circ_buf *xmit = &port->state->xmit; + u32 isr; + int ret; if (port->x_char) { if (stm32_usart_tx_dma_started(stm32_port) && stm32_usart_tx_dma_enabled(stm32_port)) stm32_usart_clr_bits(port, ofs->cr3, USART_CR3_DMAT); + + /* Check that TDR is empty before filling FIFO */ + ret = + readl_relaxed_poll_timeout_atomic(port->membase + ofs->isr, + isr, + (isr & USART_SR_TXE), + 10, 1000); + if (ret) + dev_warn(port->dev, "1 character may be erased\n"); + writel_relaxed(port->x_char, port->membase + ofs->tdr); port->x_char = 0; port->icount.tx++; From 037b91ec7729524107982e36ec4b40f9b174f7a2 Mon Sep 17 00:00:00 2001 From: Valentin Caron Date: Tue, 11 Jan 2022 17:44:41 +0100 Subject: [PATCH 0626/1295] serial: stm32: fix software flow control transfer x_char is ignored by stm32_usart_start_tx() when xmit buffer is empty. Fix start_tx condition to allow x_char to be sent. Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver") Cc: stable Signed-off-by: Erwan Le Ray Signed-off-by: Valentin Caron Link: https://lore.kernel.org/r/20220111164441.6178-3-valentin.caron@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index c1b8828451c8..9570002d07e7 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -742,7 +742,7 @@ static void stm32_usart_start_tx(struct uart_port *port) struct serial_rs485 *rs485conf = &port->rs485; struct circ_buf *xmit = &port->state->xmit; - if (uart_circ_empty(xmit)) + if (uart_circ_empty(xmit) && !port->x_char) return; if (rs485conf->flags & SER_RS485_ENABLED) { From 62f676ff7898f6c1bd26ce014564773a3dc00601 Mon Sep 17 00:00:00 2001 From: Jochen Mades Date: Sat, 23 Jan 2021 05:10:14 +0100 Subject: [PATCH 0627/1295] serial: pl011: Fix incorrect rs485 RTS polarity on set_mctrl Commit 8d479237727c ("serial: amba-pl011: add RS485 support") sought to keep RTS deasserted on set_mctrl if rs485 is enabled. However it did so only if deasserted RTS polarity is high. Fix it in case it's low. Fixes: 8d479237727c ("serial: amba-pl011: add RS485 support") Cc: stable@vger.kernel.org # v5.15+ Cc: Lino Sanfilippo Signed-off-by: Jochen Mades [lukas: copyedit commit message, add stable designation] Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/85fa3323ba8c307943969b7343e23f34c3e652ba.1642909284.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/amba-pl011.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 1f1df46242f9..f64c475a1379 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1582,8 +1582,12 @@ static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl) container_of(port, struct uart_amba_port, port); unsigned int cr; - if (port->rs485.flags & SER_RS485_ENABLED) - mctrl &= ~TIOCM_RTS; + if (port->rs485.flags & SER_RS485_ENABLED) { + if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) + mctrl &= ~TIOCM_RTS; + else + mctrl |= TIOCM_RTS; + } cr = pl011_read(uap, REG_CR); From 2dd8a74fddd21b95dcc60a2d3c9eaec993419d69 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 23 Jan 2022 05:21:14 +0100 Subject: [PATCH 0628/1295] serial: core: Initialize rs485 RTS polarity already on probe RTS polarity of rs485-enabled ports is currently initialized on uart open via: tty_port_open() tty_port_block_til_ready() tty_port_raise_dtr_rts() # if (C_BAUD(tty)) uart_dtr_rts() uart_port_dtr_rts() There's at least three problems here: First, if no baud rate is set, RTS polarity is not initialized. That's the right thing to do for rs232, but not for rs485, which requires that RTS is deasserted unconditionally. Second, if the DeviceTree property "linux,rs485-enabled-at-boot-time" is present, RTS should be deasserted as early as possible, i.e. on probe. Otherwise it may remain asserted until first open. Third, even though RTS is deasserted on open and close, it may subsequently be asserted by uart_throttle(), uart_unthrottle() or uart_set_termios() because those functions aren't rs485-aware. (Only uart_tiocmset() is.) To address these issues, move RTS initialization from uart_port_dtr_rts() to uart_configure_port(). Prevent subsequent modification of RTS polarity by moving the existing rs485 check from uart_tiocmget() to uart_update_mctrl(). That way, RTS is initialized on probe and then remains unmodified unless the uart transmits data. If rs485 is enabled at runtime (instead of at boot) through a TIOCSRS485 ioctl(), RTS is initialized by the uart driver's ->rs485_config() callback and then likewise remains unmodified. The PL011 driver initializes RTS on uart open and prevents subsequent modification in its ->set_mctrl() callback. That code is obsoleted by the present commit, so drop it. Cc: Jan Kiszka Cc: Su Bao Cheng Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/2d2acaf3a69e89b7bf687c912022b11fd29dfa1e.1642909284.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/amba-pl011.c | 15 +------------- drivers/tty/serial/serial_core.c | 34 +++++++++++--------------------- 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index f64c475a1379..ba053a68529f 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1582,13 +1582,6 @@ static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl) container_of(port, struct uart_amba_port, port); unsigned int cr; - if (port->rs485.flags & SER_RS485_ENABLED) { - if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) - mctrl &= ~TIOCM_RTS; - else - mctrl |= TIOCM_RTS; - } - cr = pl011_read(uap, REG_CR); #define TIOCMBIT(tiocmbit, uartbit) \ @@ -1812,14 +1805,8 @@ static int pl011_startup(struct uart_port *port) cr &= UART011_CR_RTS | UART011_CR_DTR; cr |= UART01x_CR_UARTEN | UART011_CR_RXE; - if (port->rs485.flags & SER_RS485_ENABLED) { - if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) - cr &= ~UART011_CR_RTS; - else - cr |= UART011_CR_RTS; - } else { + if (!(port->rs485.flags & SER_RS485_ENABLED)) cr |= UART011_CR_TXE; - } pl011_write(cr, uap, REG_CR); diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index dc40c4155356..0db90be4c3bc 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -144,6 +144,11 @@ uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear) unsigned long flags; unsigned int old; + if (port->rs485.flags & SER_RS485_ENABLED) { + set &= ~TIOCM_RTS; + clear &= ~TIOCM_RTS; + } + spin_lock_irqsave(&port->lock, flags); old = port->mctrl; port->mctrl = (old & ~clear) | set; @@ -157,23 +162,10 @@ uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear) static void uart_port_dtr_rts(struct uart_port *uport, int raise) { - int rs485_on = uport->rs485_config && - (uport->rs485.flags & SER_RS485_ENABLED); - int RTS_after_send = !!(uport->rs485.flags & SER_RS485_RTS_AFTER_SEND); - - if (raise) { - if (rs485_on && RTS_after_send) { - uart_set_mctrl(uport, TIOCM_DTR); - uart_clear_mctrl(uport, TIOCM_RTS); - } else { - uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS); - } - } else { - unsigned int clear = TIOCM_DTR; - - clear |= (!rs485_on || RTS_after_send) ? TIOCM_RTS : 0; - uart_clear_mctrl(uport, clear); - } + if (raise) + uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS); + else + uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS); } /* @@ -1075,11 +1067,6 @@ uart_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) goto out; if (!tty_io_error(tty)) { - if (uport->rs485.flags & SER_RS485_ENABLED) { - set &= ~TIOCM_RTS; - clear &= ~TIOCM_RTS; - } - uart_update_mctrl(uport, set, clear); ret = 0; } @@ -2390,6 +2377,9 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state, */ spin_lock_irqsave(&port->lock, flags); port->mctrl &= TIOCM_DTR; + if (port->rs485.flags & SER_RS485_ENABLED && + !(port->rs485.flags & SER_RS485_RTS_AFTER_SEND)) + port->mctrl |= TIOCM_RTS; port->ops->set_mctrl(port, port->mctrl); spin_unlock_irqrestore(&port->lock, flags); From 961c39121759ad09a89598ec4ccdd34ae0468a19 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Dec 2021 11:38:40 +0000 Subject: [PATCH 0629/1295] perf: Always wake the parent event When using per-process mode and event inheritance is set to true, forked processes will create a new perf events via inherit_event() -> perf_event_alloc(). But these events will not have ring buffers assigned to them. Any call to wakeup will be dropped if it's called on an event with no ring buffer assigned because that's the object that holds the wakeup list. If the child event is disabled due to a call to perf_aux_output_begin() or perf_aux_output_end(), the wakeup is dropped leaving userspace hanging forever on the poll. Normally the event is explicitly re-enabled by userspace after it wakes up to read the aux data, but in this case it does not get woken up so the event remains disabled. This can be reproduced when using Arm SPE and 'stress' which forks once before running the workload. By looking at the list of aux buffers read, it's apparent that they stop after the fork: perf record -e arm_spe// -vvv -- stress -c 1 With this patch applied they continue to be printed. This behaviour doesn't happen when using systemwide or per-cpu mode. Reported-by: Ruben Ayrapetyan Signed-off-by: James Clark Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211206113840.130802-2-james.clark@arm.com --- kernel/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 479c9e672ec4..b1c1928c0e7c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5985,6 +5985,8 @@ static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -6042,6 +6044,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6055,6 +6060,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6763,7 +6771,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6829,7 +6837,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; From c5de60cd622a2607c043ba65e25a6e9998a369f9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 24 Jan 2022 11:58:08 -0800 Subject: [PATCH 0630/1295] perf/core: Fix cgroup event list management The active cgroup events are managed in the per-cpu cgrp_cpuctx_list. This list is only accessed from current cpu and not protected by any locks. But from the commit ef54c1a476ae ("perf: Rework perf_event_exit_event()"), it's possible to access (actually modify) the list from another cpu. In the perf_remove_from_context(), it can remove an event from the context without an IPI when the context is not active. This is not safe with cgroup events which can have some active events in the context even if ctx->is_active is 0 at the moment. The target cpu might be in the middle of list iteration at the same time. If the event is enabled when it's about to be closed, it might call perf_cgroup_event_disable() and list_del() with the cgrp_cpuctx_list on a different cpu. This resulted in a crash due to an invalid list pointer access during the cgroup list traversal on the cpu which the event belongs to. Let's fallback to IPI to access the cgrp_cpuctx_list from that cpu. Similarly, perf_install_in_context() should use IPI for the cgroup events too. Fixes: ef54c1a476ae ("perf: Rework perf_event_exit_event()") Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220124195808.2252071-1-namhyung@kernel.org --- kernel/events/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index b1c1928c0e7c..76c754e45d01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2462,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - if (!ctx->is_active) { + /* + * Cgroup events are per-cpu events, and must IPI because of + * cgrp_cpuctx_list. + */ + if (!ctx->is_active && !is_cgroup_event(event)) { __perf_remove_from_context(event, __get_cpu_context(ctx), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); @@ -2895,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. + * Similarly, cgroup events for the context also needs the IPI to + * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); From 7fde14d705985dd933a3d916d39daa72b1668098 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 23 Jan 2022 01:10:04 -0800 Subject: [PATCH 0631/1295] drm/privacy-screen: honor acpi=off in detect_thinkpad_privacy_screen when acpi=off is provided in bootarg, kernel crash with [ 1.252739] BUG: kernel NULL pointer dereference, address: 0000000000000018 [ 1.258308] Call Trace: [ 1.258490] ? acpi_walk_namespace+0x147/0x147 [ 1.258770] acpi_get_devices+0xe4/0x137 [ 1.258921] ? drm_core_init+0xc0/0xc0 [drm] [ 1.259108] detect_thinkpad_privacy_screen+0x5e/0xa8 [drm] [ 1.259337] drm_privacy_screen_lookup_init+0xe/0xe85 [drm] The reason is that acpi_walk_namespace expects acpi related stuff initialized but in fact it wouldn't when acpi is set to off. In this case we should honor acpi=off in detect_thinkpad_privacy_screen(). Signed-off-by: Tong Zhang Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20220123091004.763775-1-ztong0001@gmail.com --- drivers/gpu/drm/drm_privacy_screen_x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_privacy_screen_x86.c b/drivers/gpu/drm/drm_privacy_screen_x86.c index a2cafb294ca6..e7aa74ad0b24 100644 --- a/drivers/gpu/drm/drm_privacy_screen_x86.c +++ b/drivers/gpu/drm/drm_privacy_screen_x86.c @@ -33,6 +33,9 @@ static bool __init detect_thinkpad_privacy_screen(void) unsigned long long output; acpi_status status; + if (acpi_disabled) + return false; + /* Get embedded-controller handle */ status = acpi_get_devices("PNP0C09", acpi_set_handle, NULL, &ec_handle); if (ACPI_FAILURE(status) || !ec_handle) From 549f8ffc7b2f7561bea7f90930b6c5104318e87b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 26 Jan 2022 15:50:11 +0100 Subject: [PATCH 0632/1295] ALSA: hda: Fix UAF of leds class devs at unbinding The LED class devices that are created by HD-audio codec drivers are registered via devm_led_classdev_register() and associated with the HD-audio codec device. Unfortunately, it turned out that the devres release doesn't work for this case; namely, since the codec resource release happens before the devm call chain, it triggers a NULL dereference or a UAF for a stale set_brightness_delay callback. For fixing the bug, this patch changes the LED class device register and unregister in a manual manner without devres, keeping the instances in hda_gen_spec. Reported-by: Alexander Sergeyev Cc: Link: https://lore.kernel.org/r/20220111195229.a77wrpjclqwrx4bx@localhost.localdomain Link: https://lore.kernel.org/r/20220126145011.16728-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 17 +++++++++++++++-- sound/pci/hda/hda_generic.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 3bf5e3410703..fc114e522480 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -91,6 +91,12 @@ static void snd_hda_gen_spec_free(struct hda_gen_spec *spec) free_kctls(spec); snd_array_free(&spec->paths); snd_array_free(&spec->loopback_list); +#ifdef CONFIG_SND_HDA_GENERIC_LEDS + if (spec->led_cdevs[LED_AUDIO_MUTE]) + led_classdev_unregister(spec->led_cdevs[LED_AUDIO_MUTE]); + if (spec->led_cdevs[LED_AUDIO_MICMUTE]) + led_classdev_unregister(spec->led_cdevs[LED_AUDIO_MICMUTE]); +#endif } /* @@ -3922,7 +3928,10 @@ static int create_mute_led_cdev(struct hda_codec *codec, enum led_brightness), bool micmute) { + struct hda_gen_spec *spec = codec->spec; struct led_classdev *cdev; + int idx = micmute ? LED_AUDIO_MICMUTE : LED_AUDIO_MUTE; + int err; cdev = devm_kzalloc(&codec->core.dev, sizeof(*cdev), GFP_KERNEL); if (!cdev) @@ -3932,10 +3941,14 @@ static int create_mute_led_cdev(struct hda_codec *codec, cdev->max_brightness = 1; cdev->default_trigger = micmute ? "audio-micmute" : "audio-mute"; cdev->brightness_set_blocking = callback; - cdev->brightness = ledtrig_audio_get(micmute ? LED_AUDIO_MICMUTE : LED_AUDIO_MUTE); + cdev->brightness = ledtrig_audio_get(idx); cdev->flags = LED_CORE_SUSPENDRESUME; - return devm_led_classdev_register(&codec->core.dev, cdev); + err = led_classdev_register(&codec->core.dev, cdev); + if (err < 0) + return err; + spec->led_cdevs[idx] = cdev; + return 0; } /** diff --git a/sound/pci/hda/hda_generic.h b/sound/pci/hda/hda_generic.h index 8e1bc8ea74fc..34eba40cc6e6 100644 --- a/sound/pci/hda/hda_generic.h +++ b/sound/pci/hda/hda_generic.h @@ -294,6 +294,9 @@ struct hda_gen_spec { struct hda_jack_callback *cb); void (*mic_autoswitch_hook)(struct hda_codec *codec, struct hda_jack_callback *cb); + + /* leds */ + struct led_classdev *led_cdevs[NUM_AUDIO_LEDS]; }; /* values for add_stereo_mix_input flag */ From 007c95120d1b054fb637dff24636a307a4889016 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:56 -0800 Subject: [PATCH 0633/1295] ethernet: 3com/typhoon: don't write directly to netdev->dev_addr This driver casts off the const and writes directly to netdev->dev_addr. This will result in a MAC address tree corruption and a warning. Compile tested ppc6xx_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/typhoon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 481f1df3106c..8aec5d9fbfef 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2278,6 +2278,7 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) struct net_device *dev; struct typhoon *tp; int card_id = (int) ent->driver_data; + u8 addr[ETH_ALEN] __aligned(4); void __iomem *ioaddr; void *shared; dma_addr_t shared_dma; @@ -2409,8 +2410,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) goto error_out_reset; } - *(__be16 *)&dev->dev_addr[0] = htons(le16_to_cpu(xp_resp[0].parm1)); - *(__be32 *)&dev->dev_addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2)); + *(__be16 *)&addr[0] = htons(le16_to_cpu(xp_resp[0].parm1)); + *(__be32 *)&addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2)); + eth_hw_addr_set(dev, addr); if (!is_valid_ether_addr(dev->dev_addr)) { err_msg = "Could not obtain valid ethernet address, aborting"; From 14ba66a60fbfbe69bb7faf6b45e9803c2efd7a23 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:57 -0800 Subject: [PATCH 0634/1295] ethernet: tundra: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Maintain the questionable offsetting in ndo_set_mac_address. Compile tested holly_defconfig and mpc7448_hpc2_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/tundra/tsi108_eth.c | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c index cf0917b29e30..5251fc324221 100644 --- a/drivers/net/ethernet/tundra/tsi108_eth.c +++ b/drivers/net/ethernet/tundra/tsi108_eth.c @@ -1091,20 +1091,22 @@ static int tsi108_get_mac(struct net_device *dev) struct tsi108_prv_data *data = netdev_priv(dev); u32 word1 = TSI_READ(TSI108_MAC_ADDR1); u32 word2 = TSI_READ(TSI108_MAC_ADDR2); + u8 addr[ETH_ALEN]; /* Note that the octets are reversed from what the manual says, * producing an even weirder ordering... */ if (word2 == 0 && word1 == 0) { - dev->dev_addr[0] = 0x00; - dev->dev_addr[1] = 0x06; - dev->dev_addr[2] = 0xd2; - dev->dev_addr[3] = 0x00; - dev->dev_addr[4] = 0x00; + addr[0] = 0x00; + addr[1] = 0x06; + addr[2] = 0xd2; + addr[3] = 0x00; + addr[4] = 0x00; if (0x8 == data->phy) - dev->dev_addr[5] = 0x01; + addr[5] = 0x01; else - dev->dev_addr[5] = 0x02; + addr[5] = 0x02; + eth_hw_addr_set(dev, addr); word2 = (dev->dev_addr[0] << 16) | (dev->dev_addr[1] << 24); @@ -1114,12 +1116,13 @@ static int tsi108_get_mac(struct net_device *dev) TSI_WRITE(TSI108_MAC_ADDR1, word1); TSI_WRITE(TSI108_MAC_ADDR2, word2); } else { - dev->dev_addr[0] = (word2 >> 16) & 0xff; - dev->dev_addr[1] = (word2 >> 24) & 0xff; - dev->dev_addr[2] = (word1 >> 0) & 0xff; - dev->dev_addr[3] = (word1 >> 8) & 0xff; - dev->dev_addr[4] = (word1 >> 16) & 0xff; - dev->dev_addr[5] = (word1 >> 24) & 0xff; + addr[0] = (word2 >> 16) & 0xff; + addr[1] = (word2 >> 24) & 0xff; + addr[2] = (word1 >> 0) & 0xff; + addr[3] = (word1 >> 8) & 0xff; + addr[4] = (word1 >> 16) & 0xff; + addr[5] = (word1 >> 24) & 0xff; + eth_hw_addr_set(dev, addr); } if (!is_valid_ether_addr(dev->dev_addr)) { @@ -1136,14 +1139,12 @@ static int tsi108_set_mac(struct net_device *dev, void *addr) { struct tsi108_prv_data *data = netdev_priv(dev); u32 word1, word2; - int i; if (!is_valid_ether_addr(addr)) return -EADDRNOTAVAIL; - for (i = 0; i < 6; i++) - /* +2 is for the offset of the HW addr type */ - dev->dev_addr[i] = ((unsigned char *)addr)[i + 2]; + /* +2 is for the offset of the HW addr type */ + eth_hw_addr_set(dev, ((unsigned char *)addr) + 2); word2 = (dev->dev_addr[0] << 16) | (dev->dev_addr[1] << 24); From 7f6ec2b2f01b45b39cd2ffcd49fdb30d0b6c626d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:58 -0800 Subject: [PATCH 0635/1295] ethernet: broadcom/sb1250-mac: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested bigsur_defconfig and sb1250_swarm_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/sb1250-mac.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c index f38f40eb966e..a1a38456c9a3 100644 --- a/drivers/net/ethernet/broadcom/sb1250-mac.c +++ b/drivers/net/ethernet/broadcom/sb1250-mac.c @@ -2183,9 +2183,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) ea_reg >>= 8; } - for (i = 0; i < 6; i++) { - dev->dev_addr[i] = eaddr[i]; - } + eth_hw_addr_set(dev, eaddr); /* * Initialize context (get pointers to registers and stuff), then From 98ef22bbae788e05109aba16b77deb9f37bfb285 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:59 -0800 Subject: [PATCH 0636/1295] ethernet: i825xx: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/i825xx/ether1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index c612ef526d16..3e7d7c4bafdc 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -986,6 +986,7 @@ static int ether1_probe(struct expansion_card *ec, const struct ecard_id *id) { struct net_device *dev; + u8 addr[ETH_ALEN]; int i, ret = 0; ether1_banner(); @@ -1015,7 +1016,8 @@ ether1_probe(struct expansion_card *ec, const struct ecard_id *id) } for (i = 0; i < 6; i++) - dev->dev_addr[i] = readb(IDPROM_ADDRESS + (i << 2)); + addr[i] = readb(IDPROM_ADDRESS + (i << 2)); + eth_hw_addr_set(dev, addr); if (ether1_init_2(dev)) { ret = -ENODEV; From 5518c5246ba600f52c108474ecea615ef8ff9e25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:38:00 -0800 Subject: [PATCH 0637/1295] ethernet: 8390/etherh: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/etherh.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index bd22a534b1c0..e7b879123bb1 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -655,6 +655,7 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id) struct ei_device *ei_local; struct net_device *dev; struct etherh_priv *eh; + u8 addr[ETH_ALEN]; int ret; ret = ecard_request_resources(ec); @@ -724,12 +725,13 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id) spin_lock_init(&ei_local->page_lock); if (ec->cid.product == PROD_ANT_ETHERM) { - etherm_addr(dev->dev_addr); + etherm_addr(addr); ei_local->reg_offset = etherm_regoffsets; } else { - etherh_addr(dev->dev_addr, ec); + etherh_addr(addr, ec); ei_local->reg_offset = etherh_regoffsets; } + eth_hw_addr_set(dev, addr); ei_local->name = dev->name; ei_local->word16 = 1; From 8eb86fc2f4905899a4684c1e03555e42556f4d53 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:38:01 -0800 Subject: [PATCH 0638/1295] ethernet: seeq/ether3: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/seeq/ether3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index 16a4cbae9326..c672f92d65e9 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -749,6 +749,7 @@ ether3_probe(struct expansion_card *ec, const struct ecard_id *id) const struct ether3_data *data = id->data; struct net_device *dev; int bus_type, ret; + u8 addr[ETH_ALEN]; ether3_banner(); @@ -776,7 +777,8 @@ ether3_probe(struct expansion_card *ec, const struct ecard_id *id) priv(dev)->seeq = priv(dev)->base + data->base_offset; dev->irq = ec->irq; - ether3_addr(dev->dev_addr, ec); + ether3_addr(addr, ec); + eth_hw_addr_set(dev, addr); priv(dev)->dev = dev; timer_setup(&priv(dev)->timer, ether3_ledoff, 0); From b6ab149654ef1fada0b37b379c04c475c2fdc675 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 25 Jan 2022 12:48:15 +0100 Subject: [PATCH 0639/1295] net: lan966x: Fix sleep in atomic context when injecting frames On lan966x, when injecting a frame it was polling the register QS_INJ_STATUS to see if it can continue with the injection of the frame. The problem was that it was using readx_poll_timeout which could sleep in atomic context. This patch fixes this issue by using readx_poll_timeout_atomic. Fixes: d28d6d2e37d10d ("net: lan966x: add port module support") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 2cb70da63db3..1f60fd125a1d 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -182,9 +182,9 @@ static int lan966x_port_inj_ready(struct lan966x *lan966x, u8 grp) { u32 val; - return readx_poll_timeout(lan966x_port_inj_status, lan966x, val, - QS_INJ_STATUS_FIFO_RDY_GET(val) & BIT(grp), - READL_SLEEP_US, READL_TIMEOUT_US); + return readx_poll_timeout_atomic(lan966x_port_inj_status, lan966x, val, + QS_INJ_STATUS_FIFO_RDY_GET(val) & BIT(grp), + READL_SLEEP_US, READL_TIMEOUT_US); } static int lan966x_port_ifh_xmit(struct sk_buff *skb, From 77bdaf39f3c8a10892bace06ed60bd063b731529 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 25 Jan 2022 12:48:16 +0100 Subject: [PATCH 0640/1295] net: lan966x: Fix sleep in atomic context when updating MAC table The function lan966x_mac_wait_for_completion is used to poll the status of the MAC table using the function readx_poll_timeout. The problem with this function is that is called also from atomic context. Therefore update the function to use readx_poll_timeout_atomic. Fixes: e18aba8941b40b ("net: lan966x: add mactable support") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan966x/lan966x_mac.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index ca5f1177963d..ce5970bdcc6a 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -40,11 +40,12 @@ static int lan966x_mac_wait_for_completion(struct lan966x *lan966x) { u32 val; - return readx_poll_timeout(lan966x_mac_get_status, - lan966x, val, - (ANA_MACACCESS_MAC_TABLE_CMD_GET(val)) == - MACACCESS_CMD_IDLE, - TABLE_UPDATE_SLEEP_US, TABLE_UPDATE_TIMEOUT_US); + return readx_poll_timeout_atomic(lan966x_mac_get_status, + lan966x, val, + (ANA_MACACCESS_MAC_TABLE_CMD_GET(val)) == + MACACCESS_CMD_IDLE, + TABLE_UPDATE_SLEEP_US, + TABLE_UPDATE_TIMEOUT_US); } static void lan966x_mac_select(struct lan966x *lan966x, From 37c2c83ca4f1ef4b6908181ac98e18360af89b42 Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Tue, 25 Jan 2022 18:12:15 +0800 Subject: [PATCH 0641/1295] spi: uniphier: fix reference count leak in uniphier_spi_probe() The issue happens in several error paths in uniphier_spi_probe(). When either dma_get_slave_caps() or devm_spi_register_master() returns an error code, the function forgets to decrease the refcount of both `dma_rx` and `dma_tx` objects, which may lead to refcount leaks. Fix it by decrementing the reference count of specific objects in those error paths. Signed-off-by: Xin Xiong Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Reviewed-by: Kunihiko Hayashi Fixes: 28d1dddc59f6 ("spi: uniphier: Add DMA transfer mode support") Link: https://lore.kernel.org/r/20220125101214.35677-1-xiongx18@fudan.edu.cn Signed-off-by: Mark Brown --- drivers/spi/spi-uniphier.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index 342ee8d2c476..cc0da4822231 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -726,7 +726,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "failed to get TX DMA capacities: %d\n", ret); - goto out_disable_clk; + goto out_release_dma; } dma_tx_burst = caps.max_burst; } @@ -735,7 +735,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (IS_ERR_OR_NULL(master->dma_rx)) { if (PTR_ERR(master->dma_rx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; - goto out_disable_clk; + goto out_release_dma; } master->dma_rx = NULL; dma_rx_burst = INT_MAX; @@ -744,7 +744,7 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "failed to get RX DMA capacities: %d\n", ret); - goto out_disable_clk; + goto out_release_dma; } dma_rx_burst = caps.max_burst; } @@ -753,10 +753,20 @@ static int uniphier_spi_probe(struct platform_device *pdev) ret = devm_spi_register_master(&pdev->dev, master); if (ret) - goto out_disable_clk; + goto out_release_dma; return 0; +out_release_dma: + if (!IS_ERR_OR_NULL(master->dma_rx)) { + dma_release_channel(master->dma_rx); + master->dma_rx = NULL; + } + if (!IS_ERR_OR_NULL(master->dma_tx)) { + dma_release_channel(master->dma_tx); + master->dma_tx = NULL; + } + out_disable_clk: clk_disable_unprepare(priv->clk); From 66d28b21fe6b3da8d1e9f0a7ba38bc61b6c547e1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 26 Jan 2022 09:40:01 -0600 Subject: [PATCH 0642/1295] PCI/sysfs: Find shadow ROM before static attribute initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ville reported that the sysfs "rom" file for VGA devices disappeared after 527139d738d7 ("PCI/sysfs: Convert "rom" to static attribute"). Prior to 527139d738d7, FINAL fixups, including pci_fixup_video() where we find shadow ROMs, were run before pci_create_sysfs_dev_files() created the sysfs "rom" file. After 527139d738d7, "rom" is a static attribute and is created before FINAL fixups are run, so we didn't create "rom" files for shadow ROMs: acpi_pci_root_add ... pci_scan_single_device pci_device_add pci_fixup_video # <-- new HEADER fixup device_add ... if (grp->is_visible()) pci_dev_rom_attr_is_visible # after 527139d738d7 pci_bus_add_devices pci_bus_add_device pci_fixup_device(pci_fixup_final) pci_fixup_video # <-- previous FINAL fixup pci_create_sysfs_dev_files if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) sysfs_create_bin_file("rom") # before 527139d738d7 Change pci_fixup_video() to be a HEADER fixup so it runs before sysfs static attributes are initialized. Rename the Loongson pci_fixup_radeon() to pci_fixup_video() and make its dmesg logging identical to the others since it is doing the same job. Link: https://lore.kernel.org/r/YbxqIyrkv3GhZVxx@intel.com Fixes: 527139d738d7 ("PCI/sysfs: Convert "rom" to static attribute") Link: https://lore.kernel.org/r/20220126154001.16895-1-helgaas@kernel.org Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v5.13+ Cc: Huacai Chen Cc: Jiaxun Yang Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Krzysztof Wilczyński --- arch/ia64/pci/fixup.c | 4 ++-- arch/mips/loongson64/vbios_quirk.c | 9 ++++----- arch/x86/pci/fixup.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/ia64/pci/fixup.c b/arch/ia64/pci/fixup.c index acb55a41260d..2bcdd7d3a1ad 100644 --- a/arch/ia64/pci/fixup.c +++ b/arch/ia64/pci/fixup.c @@ -76,5 +76,5 @@ static void pci_fixup_video(struct pci_dev *pdev) } } } -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); diff --git a/arch/mips/loongson64/vbios_quirk.c b/arch/mips/loongson64/vbios_quirk.c index 9a29e94d3db1..3115d4de982c 100644 --- a/arch/mips/loongson64/vbios_quirk.c +++ b/arch/mips/loongson64/vbios_quirk.c @@ -3,7 +3,7 @@ #include #include -static void pci_fixup_radeon(struct pci_dev *pdev) +static void pci_fixup_video(struct pci_dev *pdev) { struct resource *res = &pdev->resource[PCI_ROM_RESOURCE]; @@ -22,8 +22,7 @@ static void pci_fixup_radeon(struct pci_dev *pdev) res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW | IORESOURCE_PCI_FIXED; - dev_info(&pdev->dev, "BAR %d: assigned %pR for Radeon ROM\n", - PCI_ROM_RESOURCE, res); + dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n", res); } -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_ATI, 0x9615, - PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_radeon); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_ATI, 0x9615, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 2edd86649468..615a76d70019 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -353,8 +353,8 @@ static void pci_fixup_video(struct pci_dev *pdev) } } } -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); static const struct dmi_system_id msi_k8t_dmi_table[] = { From c80d401c52a2d1baf2a5afeb06f0ffe678e56d23 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Tue, 18 Jan 2022 18:05:18 +0800 Subject: [PATCH 0643/1295] cpuset: Fix the bug that subpart_cpus updated wrongly in update_cpumask() subparts_cpus should be limited as a subset of cpus_allowed, but it is updated wrongly by using cpumask_andnot(). Use cpumask_and() instead to fix it. Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag") Signed-off-by: Tianchen Ding Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bb3531e7fda7..804ff5738c5f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1635,8 +1635,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. */ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); From 519669cc58368385db93fd1560c09bf3334a6ecc Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 24 Jan 2022 16:43:59 -0800 Subject: [PATCH 0644/1295] KVM: VMX: Remove vmcs_config.order The maximum size of a VMCS (or VMXON region) is 4096. By definition, these are order 0 allocations. Signed-off-by: Jim Mattson Message-Id: <20220125004359.147600-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 1 - arch/x86/kvm/vmx/vmx.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 959b59d13b5a..3f430e218375 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,7 +54,6 @@ struct nested_vmx_msrs { struct vmcs_config { int size; - int order; u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2833b095a804..a6446eb935df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2603,7 +2603,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -2628,7 +2627,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, flags, vmcs_config.order); + pages = __alloc_pages_node(node, flags, 0); if (!pages) return NULL; vmcs = page_address(pages); @@ -2647,7 +2646,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + free_page((unsigned long)vmcs); } /* From 35fe7cfbab2e81f1afb23fc4212210b1de6d9633 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 25 Jan 2022 01:17:00 -0800 Subject: [PATCH 0645/1295] KVM: LAPIC: Also cancel preemption timer during SET_LAPIC The below warning is splatting during guest reboot. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1931 at arch/x86/kvm/x86.c:10322 kvm_arch_vcpu_ioctl_run+0x874/0x880 [kvm] CPU: 0 PID: 1931 Comm: qemu-system-x86 Tainted: G I 5.17.0-rc1+ #5 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x874/0x880 [kvm] Call Trace: kvm_vcpu_ioctl+0x279/0x710 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fd39797350b This can be triggered by not exposing tsc-deadline mode and doing a reboot in the guest. The lapic_shutdown() function which is called in sys_reboot path will not disarm the flying timer, it just masks LVTT. lapic_shutdown() clears APIC state w/ LVT_MASKED and timer-mode bit is 0, this can trigger timer-mode switch between tsc-deadline and oneshot/periodic, which can result in preemption timer be cancelled in apic_update_lvtt(). However, We can't depend on this when not exposing tsc-deadline mode and oneshot/periodic modes emulated by preemption timer. Qemu will synchronise states around reset, let's cancel preemption timer under KVM_SET_LAPIC. Signed-off-by: Wanpeng Li Message-Id: <1643102220-35667-1-git-send-email-wanpengli@tencent.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index baca9fa37a91..4662469240bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_set_version(vcpu); apic_update_ppr(apic); - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); From 1ffce0924a8c86cf0590c039cd5f5c9375d32e9b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 25 Jan 2022 19:52:23 +0800 Subject: [PATCH 0646/1295] KVM: x86/cpuid: Exclude unpermitted xfeatures sizes at KVM_GET_SUPPORTED_CPUID With the help of xstate_get_guest_group_perm(), KVM can exclude unpermitted xfeatures in cpuid.0xd.0.eax, in which case the corresponding xfeatures sizes should also be matched to the permitted xfeatures. To fix this inconsistency, the permitted_xcr0 and permitted_xss are defined consistently, which implies 'supported' plus certain permissions for this task, and it also fixes cpuid.0xd.1.ebx and later leaf-by-leaf queries. Fixes: 445ecdf79be0 ("kvm: x86: Exclude unpermitted xfeatures at KVM_GET_SUPPORTED_CPUID") Signed-off-by: Like Xu Message-Id: <20220125115223.33707-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89d7822a8f5b..8e2704b50e47 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -899,13 +899,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 guest_perm = xstate_get_guest_group_perm(); + u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = supported_xss; - entry->eax &= supported_xcr0 & guest_perm; - entry->ebx = xstate_required_size(supported_xcr0, false); + entry->eax &= permitted_xcr0; + entry->ebx = xstate_required_size(permitted_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= (supported_xcr0 & guest_perm) >> 32; - if (!supported_xcr0) + entry->edx &= permitted_xcr0 >> 32; + if (!permitted_xcr0) break; entry = do_host_cpuid(array, function, 1); @@ -914,20 +915,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) - entry->ebx = xstate_required_size(supported_xcr0 | supported_xss, + entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { - WARN_ON_ONCE(supported_xss != 0); + WARN_ON_ONCE(permitted_xss != 0); entry->ebx = 0; } - entry->ecx &= supported_xss; - entry->edx &= supported_xss >> 32; + entry->ecx &= permitted_xss; + entry->edx &= permitted_xss >> 32; for (i = 2; i < 64; ++i) { bool s_state; - if (supported_xcr0 & BIT_ULL(i)) + if (permitted_xcr0 & BIT_ULL(i)) s_state = false; - else if (supported_xss & BIT_ULL(i)) + else if (permitted_xss & BIT_ULL(i)) s_state = true; else continue; @@ -941,7 +942,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero * save state size. Furthermore, check whether the - * processor agrees with supported_xcr0/supported_xss + * processor agrees with permitted_xcr0/permitted_xss * on whether this is an XCR0- or IA32_XSS-managed area. */ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { From 47c28d436f409f5b009dc82bd82d4971088aa391 Mon Sep 17 00:00:00 2001 From: Denis Valeev Date: Sat, 22 Jan 2022 23:13:57 +0300 Subject: [PATCH 0647/1295] KVM: x86: nSVM: skip eax alignment check for non-SVM instructions The bug occurs on #GP triggered by VMware backdoor when eax value is unaligned. eax alignment check should not be applied to non-SVM instructions because it leads to incorrect omission of the instructions emulation. Apply the alignment check only to SVM instructions to fix. Fixes: d1cba6c92237 ("KVM: x86: nSVM: test eax for 4K alignment for GP errata workaround") Signed-off-by: Denis Valeev Message-Id: Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6d31d357a83b..ec05f8bfc21b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2091,10 +2091,6 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; - /* All SVM instructions expect page aligned RAX */ - if (svm->vmcb->save.rax & ~PAGE_MASK) - goto reinject; - /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -2112,8 +2108,13 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu)) return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); - } else + } else { + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + return emulate_svm_instr(vcpu, opcode); + } reinject: kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); From 55467fcd55b89c622e62b4afe60ac0eb2fae91f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:11 +0000 Subject: [PATCH 0648/1295] KVM: SVM: Never reject emulation due to SMAP errata for !SEV guests Always signal that emulation is possible for !SEV guests regardless of whether or not the CPU provided a valid instruction byte stream. KVM can read all guest state (memory and registers) for !SEV guests, i.e. can fetch the code stream from memory even if the CPU failed to do so because of the SMAP errata. Fixes: 05d5a4863525 ("KVM: SVM: Workaround errata#1096 (insn_len maybe zero on SMAP violation)") Cc: stable@vger.kernel.org Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ec05f8bfc21b..a46e1c6abf0e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4258,8 +4258,13 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i bool smep, smap, is_user; unsigned long cr4; + /* Emulation is always possible when KVM has access to all guest state. */ + if (!sev_guest(vcpu->kvm)) + return true; + /* - * When the guest is an SEV-ES guest, emulation is not possible. + * Emulation is impossible for SEV-ES guests as KVM doesn't have access + * to guest register state. */ if (sev_es_guest(vcpu->kvm)) return false; @@ -4319,9 +4324,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - if (!sev_guest(vcpu->kvm)) - return true; - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } From 31c25585695abdf03d6160aa6d829e855b256329 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:12 +0000 Subject: [PATCH 0649/1295] Revert "KVM: SVM: avoid infinite loop on NPF from bad address" Revert a completely broken check on an "invalid" RIP in SVM's workaround for the DecodeAssists SMAP errata. kvm_vcpu_gfn_to_memslot() obviously expects a gfn, i.e. operates in the guest physical address space, whereas RIP is a virtual (not even linear) address. The "fix" worked for the problematic KVM selftest because the test identity mapped RIP. Fully revert the hack instead of trying to translate RIP to a GPA, as the non-SEV case is now handled earlier, and KVM cannot access guest page tables to translate RIP. This reverts commit e72436bc3a5206f95bb384e741154166ddb3202e. Fixes: e72436bc3a52 ("KVM: SVM: avoid infinite loop on NPF from bad address") Reported-by: Liam Merwick Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 ------- virt/kvm/kvm_main.c | 1 - 2 files changed, 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a46e1c6abf0e..37eb3168e0ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4312,13 +4312,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i if (likely(!insn || insn_len)) return true; - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; - cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0bbb3cbf6014..2755ba4177d6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2248,7 +2248,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { From 0b0be065b7563ac708aaa9f69dd4941c80b3446d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:13 +0000 Subject: [PATCH 0650/1295] KVM: SVM: Don't intercept #GP for SEV guests Never intercept #GP for SEV guests as reading SEV guest private memory will return cyphertext, i.e. emulating on #GP can't work as intended. Cc: stable@vger.kernel.org Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 37eb3168e0ea..defc91a8c04c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return ret; } - if (svm_gp_erratum_intercept) + /* + * Never intercept #GP for SEV guests, KVM can't + * decrypt guest memory to workaround the erratum. + */ + if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); } } @@ -1010,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. + * as VMware does. Don't intercept #GP for SEV guests as KVM can't + * decrypt guest memory to decode the faulting instruction. */ - if (enable_vmware_backdoor) + if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); From c532f2903b69b775d27016511fbe29a14a098f95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:14 +0000 Subject: [PATCH 0651/1295] KVM: SVM: Explicitly require DECODEASSISTS to enable SEV support Add a sanity check on DECODEASSIST being support if SEV is supported, as KVM cannot read guest private memory and thus relies on the CPU to provide the instruction byte stream on #NPF for emulation. The intent of the check is to document the dependency, it should never fail in practice as producing hardware that supports SEV but not DECODEASSISTS would be non-sensical. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6a22798eaaee..17b53457d866 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void) if (!sev_enabled || !npt_enabled) goto out; - /* Does the CPU support SEV? */ - if (!boot_cpu_has(X86_FEATURE_SEV)) + /* + * SEV must obviously be supported in hardware. Sanity check that the + * CPU supports decode assists, which is mandatory for SEV guests to + * support instruction emulation. + */ + if (!boot_cpu_has(X86_FEATURE_SEV) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) goto out; /* Retrieve SEV CPUID information */ From 4d31d9eff244e2631f028d658979ccbbdbcb423b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:15 +0000 Subject: [PATCH 0652/1295] KVM: x86: Pass emulation type to can_emulate_instruction() Pass the emulation type to kvm_x86_ops.can_emulate_insutrction() so that a future commit can harden KVM's SEV support to WARN on emulation scenarios that should never happen. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 7 ++++--- arch/x86/kvm/x86.c | 11 +++++++++-- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9b2fffeeab16..2fef4ef62061 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1482,7 +1482,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index defc91a8c04c..115743e250bb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4258,7 +4258,8 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; unsigned long cr4; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a6446eb935df..b1165bb13a5a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does - * not point tthe failing instruction, and even if it did, the code + * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). @@ -5425,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ceeab1e5bdc..c8f3d3ea8a96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6810,6 +6810,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type, + insn, insn_len); +} + int handle_ud(struct kvm_vcpu *vcpu) { static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; @@ -6817,7 +6824,7 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) + if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) return 1; if (force_emulation_prefix && @@ -8193,7 +8200,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool writeback = true; bool write_fault_to_spt; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) + if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; From 132627c64d94b1561ba5a444824e46c9f84c3d5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:16 +0000 Subject: [PATCH 0653/1295] KVM: SVM: WARN if KVM attempts emulation on #UD or #GP for SEV guests WARN if KVM attempts to emulate in response to #UD or #GP for SEV guests, i.e. if KVM intercepts #UD or #GP, as emulation on any fault except #NPF is impossible since KVM cannot read guest private memory to get the code stream, and the CPU's DecodeAssists feature only provides the instruction bytes on #NPF. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-7-seanjc@google.com> [Warn on EMULTYPE_TRAP_UD_FORCED according to Liam Merwick's review. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 115743e250bb..85bbfba1fa07 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4268,6 +4268,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, if (!sev_guest(vcpu->kvm)) return true; + /* #UD and #GP should never be intercepted for SEV guests. */ + WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | + EMULTYPE_TRAP_UD_FORCED | + EMULTYPE_VMWARE_GP)); + /* * Emulation is impossible for SEV-ES guests as KVM doesn't have access * to guest register state. From 04c40f344defdbd842d8a64fcfb47ef74b39ef4e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:17 +0000 Subject: [PATCH 0654/1295] KVM: SVM: Inject #UD on attempted emulation for SEV guest w/o insn buffer Inject #UD if KVM attempts emulation for an SEV guests without an insn buffer and instruction decoding is required. The previous behavior of allowing emulation if there is no insn buffer is undesirable as doing so means KVM is reading guest private memory and thus decoding cyphertext, i.e. is emulating garbage. The check was previously necessary as the emulation type was not provided, i.e. SVM needed to allow emulation to handle completion of emulation after exiting to userspace to handle I/O. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 89 ++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 85bbfba1fa07..fb65bfabea25 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4280,49 +4280,70 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, if (sev_es_guest(vcpu->kvm)) return false; + /* + * Emulation is possible if the instruction is already decoded, e.g. + * when completing I/O after returning from userspace. + */ + if (emul_type & EMULTYPE_NO_DECODE) + return true; + + /* + * Emulation is possible for SEV guests if and only if a prefilled + * buffer containing the bytes of the intercepted instruction is + * available. SEV guest memory is encrypted with a guest specific key + * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * decode garbage. + * + * Inject #UD if KVM reached this point without an instruction buffer. + * In practice, this path should never be hit by a well-behaved guest, + * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path + * is still theoretically reachable, e.g. via unaccelerated fault-like + * AVIC access, and needs to be handled by KVM to avoid putting the + * guest into an infinite loop. Injecting #UD is somewhat arbitrary, + * but its the least awful option given lack of insight into the guest. + */ + if (unlikely(!insn)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } + + /* + * Emulate for SEV guests if the insn buffer is not empty. The buffer + * will be empty if the DecodeAssist microcode cannot fetch bytes for + * the faulting instruction because the code fetch itself faulted, e.g. + * the guest attempted to fetch from emulated MMIO or a guest page + * table used to translate CS:RIP resides in emulated MMIO. + */ + if (likely(insn_len)) + return true; + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: - * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is - * possible that CPU microcode implementing DecodeAssist will fail - * to read bytes of instruction which caused #NPF. In this case, - * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly - * return 0 instead of the correct guest instruction bytes. - * - * This happens because CPU microcode reading instruction bytes - * uses a special opcode which attempts to read data using CPL=0 - * privileges. The microcode reads CS:RIP and if it hits a SMAP - * fault, it gives up and returns no instruction bytes. + * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is + * possible that CPU microcode implementing DecodeAssist will fail to + * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly + * be '0'. This happens because microcode reads CS:RIP using a _data_ + * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode + * gives up and does not fill the instruction bytes buffer. * * Detection: - * We reach here in case CPU supports DecodeAssist, raised #NPF and - * returned 0 in GuestIntrBytes field of the VMCB. - * First, errata can only be triggered in case vCPU CR4.SMAP=1. - * Second, if vCPU CR4.SMEP=1, errata could only be triggered - * in case vCPU CPL==3 (Because otherwise guest would have triggered - * a SMEP fault instead of #NPF). - * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL. - * As most guests enable SMAP if they have also enabled SMEP, use above - * logic in order to attempt minimize false-positive of detecting errata - * while still preserving all cases semantic correctness. + * KVM reaches this point if the VM is an SEV guest, the CPU supports + * DecodeAssist, a #NPF was raised, KVM's page fault handler triggered + * emulation (e.g. for MMIO), and the CPU returned 0 in GuestIntrBytes + * field of the VMCB. * - * Workaround: - * To determine what instruction the guest was executing, the hypervisor - * will have to decode the instruction at the instruction pointer. + * This does _not_ mean that the erratum has been encountered, as the + * DecodeAssist will also fail if the load for CS:RIP hits a legitimate + * #PF, e.g. if the guest attempt to execute from emulated MMIO and + * encountered a reserved/not-present #PF. * - * In non SEV guest, hypervisor will be able to read the guest - * memory to decode the instruction pointer when insn_len is zero - * so we return true to indicate that decoding is possible. - * - * But in the SEV guest, the guest memory is encrypted with the - * guest specific key and hypervisor will not be able to decode the - * instruction pointer so we will not able to workaround it. Lets - * print the error and request to kill the guest. + * To reduce the likelihood of false positives, take action if and only + * if CR4.SMAP=1 (obviously required to hit the erratum) and CR4.SMEP=0 + * or CPL=3. If SMEP=1 and CPL!=3, the erratum cannot have been hit as + * the guest would have encountered a SMEP violation #PF, not a #NPF. */ - if (likely(!insn || insn_len)) - return true; - cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; From 3280cc22aea74d78ebbea277ff8bc8d593582de3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:18 +0000 Subject: [PATCH 0655/1295] KVM: SVM: Don't apply SEV+SMAP workaround on code fetch or PT access Resume the guest instead of synthesizing a triple fault shutdown if the instruction bytes buffer is empty due to the #NPF being on the code fetch itself or on a page table access. The SMAP errata applies if and only if the code fetch was successful and ucode's subsequent data read from the code page encountered a SMAP violation. In practice, the guest is likely hosed either way, but crashing the guest on a code fetch to emulated MMIO is technically wrong according to the behavior described in the APM. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 43 +++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fb65bfabea25..be25831830b0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4263,6 +4263,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, { bool smep, smap, is_user; unsigned long cr4; + u64 error_code; /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) @@ -4328,22 +4329,31 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode * gives up and does not fill the instruction bytes buffer. * - * Detection: - * KVM reaches this point if the VM is an SEV guest, the CPU supports - * DecodeAssist, a #NPF was raised, KVM's page fault handler triggered - * emulation (e.g. for MMIO), and the CPU returned 0 in GuestIntrBytes - * field of the VMCB. + * As above, KVM reaches this point iff the VM is an SEV guest, the CPU + * supports DecodeAssist, a #NPF was raised, KVM's page fault handler + * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the + * GuestIntrBytes field of the VMCB. * * This does _not_ mean that the erratum has been encountered, as the * DecodeAssist will also fail if the load for CS:RIP hits a legitimate * #PF, e.g. if the guest attempt to execute from emulated MMIO and * encountered a reserved/not-present #PF. * - * To reduce the likelihood of false positives, take action if and only - * if CR4.SMAP=1 (obviously required to hit the erratum) and CR4.SMEP=0 - * or CPL=3. If SMEP=1 and CPL!=3, the erratum cannot have been hit as - * the guest would have encountered a SMEP violation #PF, not a #NPF. + * To hit the erratum, the following conditions must be true: + * 1. CR4.SMAP=1 (obviously). + * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot + * have been hit as the guest would have encountered a SMEP + * violation #PF, not a #NPF. + * 3. The #NPF is not due to a code fetch, in which case failure to + * retrieve the instruction bytes is legitimate (see abvoe). + * + * In addition, don't apply the erratum workaround if the #NPF occurred + * while translating guest page tables (see below). */ + error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) + goto resume_guest; + cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; @@ -4353,6 +4363,21 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } +resume_guest: + /* + * If the erratum was not hit, simply resume the guest and let it fault + * again. While awful, e.g. the vCPU may get stuck in an infinite loop + * if the fault is at CPL=0, it's the lesser of all evils. Exiting to + * userspace will kill the guest, and letting the emulator read garbage + * will yield random behavior and potentially corrupt the guest. + * + * Simply resuming the guest is technically not a violation of the SEV + * architecture. AMD's APM states that all code fetches and page table + * accesses for SEV guest are encrypted, regardless of the C-Bit. The + * APM also states that encrypted accesses to MMIO are "ignored", but + * doesn't explicitly define "ignored", i.e. doing nothing and letting + * the guest spin is technically "ignoring" the access. + */ return false; } From cdf85e0c5dc766fc7fc779466280e454a6d04f87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Jan 2022 01:07:19 +0000 Subject: [PATCH 0656/1295] KVM: SVM: Don't kill SEV guest if SMAP erratum triggers in usermode Inject a #GP instead of synthesizing triple fault to try to avoid killing the guest if emulation of an SEV guest fails due to encountering the SMAP erratum. The injected #GP may still be fatal to the guest, e.g. if the userspace process is providing critical functionality, but KVM should make every attempt to keep the guest alive. Signed-off-by: Sean Christopherson Reviewed-by: Liam Merwick Message-Id: <20220120010719.711476-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index be25831830b0..f14f4ab97a80 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4360,7 +4360,21 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* + * If the fault occurred in userspace, arbitrarily inject #GP + * to avoid killing the guest and to hopefully avoid confusing + * the guest kernel too much, e.g. injecting #PF would not be + * coherent with respect to the guest's page tables. Request + * triple fault if the fault occurred in the kernel as there's + * no fault that KVM can inject without confusing the guest. + * In practice, the triple fault is moot as no sane SEV kernel + * will execute from user memory while also running with SMAP=1. + */ + if (is_user) + kvm_inject_gp(vcpu, 0); + else + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } resume_guest: From 38dfa8308cfc43f671a74c753302fec26808edc0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 20 Dec 2021 16:21:36 +0100 Subject: [PATCH 0657/1295] KVM: SVM: hyper-v: Enable Enlightened MSR-Bitmap support for real Commit c4327f15dfc7 ("KVM: SVM: hyper-v: Enlightened MSR-Bitmap support") introduced enlightened MSR-Bitmap support for KVM-on-Hyper-V but it didn't actually enable the support. Similar to enlightened NPT TLB flush and direct TLB flush features, the guest (KVM) has to tell L0 (Hyper-V) that it's using the feature by setting the appropriate feature fit in VMCB control area (sw reserved fields). Fixes: c4327f15dfc7 ("KVM: SVM: hyper-v: Enlightened MSR-Bitmap support") Signed-off-by: Vitaly Kuznetsov Message-Id: <20211220152139.418372-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm_onhyperv.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index c53b8bf8d013..1ad43e997053 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) hve->hv_enlightenments_control.enlightened_npt_tlb = 1; + + if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP) + hve->hv_enlightenments_control.msr_bitmap = 1; } static inline void svm_hv_hardware_setup(void) From aa3b39f38c7a5dfdd10b3f61a0d055b85aa85451 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 20 Dec 2021 16:21:35 +0100 Subject: [PATCH 0658/1295] KVM: SVM: drop unnecessary code in svm_hv_vmcb_dirty_nested_enlightenments() Commit 3fa5e8fd0a0e4 ("KVM: SVM: delay svm_vcpu_init_msrpm after svm->vmcb is initialized") re-arranged svm_vcpu_init_msrpm() call in svm_create_vcpu(), thus making the comment about vmcb being NULL obsolete. Drop it. While on it, drop superfluous vmcb_is_clean() check: vmcb_mark_dirty() is a bit flip, an extra check is unlikely to bring any performance gain. Drop now-unneeded vmcb_is_clean() helper as well. Fixes: 3fa5e8fd0a0e4 ("KVM: SVM: delay svm_vcpu_init_msrpm after svm->vmcb is initialized") Signed-off-by: Vitaly Kuznetsov Message-Id: <20211220152139.418372-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 5 ----- arch/x86/kvm/svm/svm_onhyperv.h | 9 +-------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 47ef8f4a9358..1160faaf6ef3 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -304,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) -{ - return (vmcb->control.clean & (1 << bit)); -} - static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 1ad43e997053..489ca56212c6 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -86,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct hv_enlightenments *hve = (struct hv_enlightenments *)vmcb->control.reserved_sw; - /* - * vmcb can be NULL if called during early vcpu init. - * And its okay not to mark vmcb dirty during vcpu init - * as we mark it dirty unconditionally towards end of vcpu - * init phase. - */ - if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && - hve->hv_enlightenments_control.msr_bitmap) + if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } From f7e570780efc5cec9b2ed1e0472a7da14e864fdb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 25 Jan 2022 22:03:58 +0000 Subject: [PATCH 0659/1295] KVM: x86: Forcibly leave nested virt when SMM state is toggled Forcibly leave nested virtualization operation if userspace toggles SMM state via KVM_SET_VCPU_EVENTS or KVM_SYNC_X86_EVENTS. If userspace forces the vCPU out of SMM while it's post-VMXON and then injects an SMI, vmx_enter_smm() will overwrite vmx->nested.smm.vmxon and end up with both vmxon=false and smm.vmxon=false, but all other nVMX state allocated. Don't attempt to gracefully handle the transition as (a) most transitions are nonsencial, e.g. forcing SMM while L2 is running, (b) there isn't sufficient information to handle all transitions, e.g. SVM wants access to the SMRAM save state, and (c) KVM_SET_VCPU_EVENTS must precede KVM_SET_NESTED_STATE during state restore as the latter disallows putting the vCPU into L2 if SMM is active, and disallows tagging the vCPU as being post-VMXON in SMM if SMM is not active. Abuse of KVM_SET_VCPU_EVENTS manifests as a WARN and memory leak in nVMX due to failure to free vmcs01's shadow VMCS, but the bug goes far beyond just a memory leak, e.g. toggling SMM on while L2 is active puts the vCPU in an architecturally impossible state. WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 Modules linked in: CPU: 1 PID: 3606 Comm: syz-executor725 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] RIP: 0010:free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 Code: <0f> 0b eb b3 e8 8f 4d 9f 00 e9 f7 fe ff ff 48 89 df e8 92 4d 9f 00 Call Trace: kvm_arch_vcpu_destroy+0x72/0x2f0 arch/x86/kvm/x86.c:11123 kvm_vcpu_destroy arch/x86/kvm/../../../virt/kvm/kvm_main.c:441 [inline] kvm_destroy_vcpus+0x11f/0x290 arch/x86/kvm/../../../virt/kvm/kvm_main.c:460 kvm_free_vcpus arch/x86/kvm/x86.c:11564 [inline] kvm_arch_destroy_vm+0x2e8/0x470 arch/x86/kvm/x86.c:11676 kvm_destroy_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1217 [inline] kvm_put_kvm+0x4fa/0xb00 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1250 kvm_vm_release+0x3f/0x50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1273 __fput+0x286/0x9f0 fs/file_table.c:311 task_work_run+0xdd/0x1a0 kernel/task_work.c:164 exit_task_work include/linux/task_work.h:32 [inline] do_exit+0xb29/0x2a30 kernel/exit.c:806 do_group_exit+0xd2/0x2f0 kernel/exit.c:935 get_signal+0x4b0/0x28c0 kernel/signal.c:2862 arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: stable@vger.kernel.org Reported-by: syzbot+8112db3ab20e70d50c31@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20220125220358.2091737-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/nested.c | 9 +++++---- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/vmx/nested.c | 1 + arch/x86/kvm/x86.c | 4 +++- 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2fef4ef62061..efee29d5ad4f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1496,6 +1496,7 @@ struct kvm_x86_ops { }; struct kvm_x86_nested_ops { + void (*leave_nested)(struct kvm_vcpu *vcpu); int (*check_events)(struct kvm_vcpu *vcpu); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf206855ebf0..1218b5a342fc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm) /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ -void svm_leave_nested(struct vcpu_svm *svm) +void svm_leave_nested(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; @@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; } @@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) - svm_leave_nested(svm); + svm_leave_nested(vcpu); else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; @@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) } struct kvm_x86_nested_ops svm_nested_ops = { + .leave_nested = svm_leave_nested, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f14f4ab97a80..9cef8e4598df 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1160faaf6ef3..73525353e424 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -520,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); -void svm_leave_nested(struct vcpu_svm *svm); +void svm_leave_nested(struct kvm_vcpu *vcpu); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f235f77cbc03..7eebfdf7204f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6771,6 +6771,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) } struct kvm_x86_nested_ops vmx_nested_ops = { + .leave_nested = vmx_leave_nested, .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, .triple_fault = nested_vmx_triple_fault, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8f3d3ea8a96..a50baf4c5bff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4860,8 +4860,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + kvm_x86_ops.nested_ops->leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); + } vcpu->arch.smi_pending = events->smi.pending; From 033a3ea59a19df63edb4db6bfdbb357cd028258a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 Jan 2022 14:18:04 +0100 Subject: [PATCH 0660/1295] KVM: x86: Check .flags in kvm_cpuid_check_equal() too kvm_cpuid_check_equal() checks for the (full) equality of the supplied CPUID data so .flags need to be checked too. Reported-by: Sean Christopherson Fixes: c6617c61e8fe ("KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN") Signed-off-by: Vitaly Kuznetsov Message-Id: <20220126131804.2839410-1-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8e2704b50e47..d0c9b10d448d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -133,6 +133,7 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 orig = &vcpu->arch.cpuid_entries[i]; if (e2[i].function != orig->function || e2[i].index != orig->index || + e2[i].flags != orig->flags || e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) return -EINVAL; From 4cf3d3ebe8794c449af3e0e8c1d790c97e461d20 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 25 Jan 2022 22:17:25 +0000 Subject: [PATCH 0661/1295] KVM: selftests: Don't skip L2's VMCALL in SMM test for SVM guest Don't skip the vmcall() in l2_guest_code() prior to re-entering L2, doing so will result in L2 running to completion, popping '0' off the stack for RET, jumping to address '0', and ultimately dying with a triple fault shutdown. It's not at all obvious why the test re-enters L2 and re-executes VMCALL, but presumably it serves a purpose. The VMX path doesn't skip vmcall(), and the test can't possibly have passed on SVM, so just do what VMX does. Fixes: d951b2210c1a ("KVM: selftests: smm_test: Test SMM enter from L2") Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20220125221725.2101126-1-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Tested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/smm_test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 2da8eb8e2d96..a626d40fdb48 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -105,7 +105,6 @@ static void guest_code(void *arg) if (cpu_has_svm()) { run_guest(svm->vmcb, svm->vmcb_gpa); - svm->vmcb->save.rip += 3; run_guest(svm->vmcb, svm->vmcb_gpa); } else { vmlaunch(); From d6e656cd266cdcc95abd372c7faef05bee271d1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 25 Jan 2022 22:05:27 +0000 Subject: [PATCH 0662/1295] KVM: nVMX: WARN on any attempt to allocate shadow VMCS for vmcs02 WARN if KVM attempts to allocate a shadow VMCS for vmcs02. KVM emulates VMCS shadowing but doesn't virtualize it, i.e. KVM should never allocate a "real" shadow VMCS for L2. The previous code WARNed but continued anyway with the allocation, presumably in an attempt to avoid NULL pointer dereference. However, alloc_vmcs (and hence alloc_shadow_vmcs) can fail, and indeed the sole caller does: if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; which makes it not a useful attempt. Signed-off-by: Sean Christopherson Message-Id: <20220125220527.2093146-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7eebfdf7204f..2777cea05cc0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4851,18 +4851,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. + * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it + * when L1 executes VMXOFF or the vCPU is forced out of nested + * operation. VMXON faults if the CPU is already post-VMXON, so it + * should be impossible to already have an allocated shadow VMCS. KVM + * doesn't support virtualization of VMCS shadowing, so vmcs01 should + * always be the loaded VMCS. */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) + return loaded_vmcs->shadow_vmcs; + + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); - } return loaded_vmcs->shadow_vmcs; } From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 26 Jan 2022 09:19:20 -0800 Subject: [PATCH 0663/1295] xfs, iomap: limit individual ioend chain lengths in writeback Trond Myklebust reported soft lockups in XFS IO completion such as this: watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106] CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1 Workqueue: xfs-conv/md127 xfs_end_io [xfs] RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20 Call Trace: wake_up_page_bit+0x8a/0x110 iomap_finish_ioend+0xd7/0x1c0 iomap_finish_ioends+0x7f/0xb0 xfs_end_ioend+0x6b/0x100 [xfs] xfs_end_io+0xb9/0xe0 [xfs] process_one_work+0x1a7/0x360 worker_thread+0x1fa/0x390 kthread+0x116/0x130 ret_from_fork+0x35/0x40 Ioends are processed as an atomic completion unit when all the chained bios in the ioend have completed their IO. Logically contiguous ioends can also be merged and completed as a single, larger unit. Both of these things can be problematic as both the bio chains per ioend and the size of the merged ioends processed as a single completion are both unbound. If we have a large sequential dirty region in the page cache, write_cache_pages() will keep feeding us sequential pages and we will keep mapping them into ioends and bios until we get a dirty page at a non-sequential file offset. These large sequential runs can will result in bio and ioend chaining to optimise the io patterns. The pages iunder writeback are pinned within these chains until the submission chaining is broken, allowing the entire chain to be completed. This can result in huge chains being processed in IO completion context. We get deep bio chaining if we have large contiguous physical extents. We will keep adding pages to the current bio until it is full, then we'll chain a new bio to keep adding pages for writeback. Hence we can build bio chains that map millions of pages and tens of gigabytes of RAM if the page cache contains big enough contiguous dirty file regions. This long bio chain pins those pages until the final bio in the chain completes and the ioend can iterate all the chained bios and complete them. OTOH, if we have a physically fragmented file, we end up submitting one ioend per physical fragment that each have a small bio or bio chain attached to them. We do not chain these at IO submission time, but instead we chain them at completion time based on file offset via iomap_ioend_try_merge(). Hence we can end up with unbound ioend chains being built via completion merging. XFS can then do COW remapping or unwritten extent conversion on that merged chain, which involves walking an extent fragment at a time and running a transaction to modify the physical extent information. IOWs, we merge all the discontiguous ioends together into a contiguous file range, only to then process them individually as discontiguous extents. This extent manipulation is computationally expensive and can run in a tight loop, so merging logically contiguous but physically discontigous ioends gains us nothing except for hiding the fact the fact we broke the ioends up into individual physical extents at submission and then need to loop over those individual physical extents at completion. Hence we need to have mechanisms to limit ioend sizes and to break up completion processing of large merged ioend chains: 1. bio chains per ioend need to be bound in length. Pure overwrites go straight to iomap_finish_ioend() in softirq context with the exact bio chain attached to the ioend by submission. Hence the only way to prevent long holdoffs here is to bound ioend submission sizes because we can't reschedule in softirq context. 2. iomap_finish_ioends() has to handle unbound merged ioend chains correctly. This relies on any one call to iomap_finish_ioend() being bound in runtime so that cond_resched() can be issued regularly as the long ioend chain is processed. i.e. this relies on mechanism #1 to limit individual ioend sizes to work correctly. 3. filesystems have to loop over the merged ioends to process physical extent manipulations. This means they can loop internally, and so we break merging at physical extent boundaries so the filesystem can easily insert reschedule points between individual extent manipulations. Signed-off-by: Dave Chinner Reported-and-tested-by: Trond Myklebust Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_aops.c | 16 ++++++++++++- include/linux/iomap.h | 2 ++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c938bbad075e..6c51a75d0be6 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -21,6 +21,8 @@ #include "../internal.h" +#define IOEND_BATCH_SIZE 4096 + /* * Structure allocated for each folio when block size < folio size * to track sub-folio uptodate status and I/O completions. @@ -1039,7 +1041,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static void +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; @@ -1048,6 +1050,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) u64 start = bio->bi_iter.bi_sector; loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); + u32 folio_count = 0; for (bio = &ioend->io_inline_bio; bio; bio = next) { struct folio_iter fi; @@ -1062,9 +1065,11 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) next = bio->bi_private; /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) + bio_for_each_folio_all(fi, bio) { iomap_finish_folio_write(inode, fi.folio, fi.length, error); + folio_count++; + } bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1074,20 +1079,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, offset, start); } + return folio_count; } +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; + u32 completions; + + might_sleep(); list_replace_init(&ioend->io_list, &tmp); - iomap_finish_ioend(ioend, error); + completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); - iomap_finish_ioend(ioend, error); + completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); @@ -1108,6 +1129,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) + return false; return true; } @@ -1209,8 +1242,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; + ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_bio = bio; + ioend->io_sector = sector; return ioend; } @@ -1251,6 +1286,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, return false; if (sector != bio_end_sector(wpc->ioend->io_bio)) return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + return false; return true; } @@ -1335,6 +1377,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, &submit_list); count++; } + if (count) + wpc->ioend->io_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!folio_test_locked(folio)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2705f91bdd0d..9d6a67c7d227 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -136,7 +136,20 @@ done: memalloc_nofs_restore(nofs_flag); } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. + */ void xfs_end_io( struct work_struct *work) @@ -157,6 +170,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); + cond_resched(); } } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b55bd49e55f5..97a3a2edb585 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -263,9 +263,11 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; From 811f95ff95270e6048197821434d9301e3d7f07c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 25 Jan 2022 21:04:45 +0000 Subject: [PATCH 0664/1295] KVM: x86: Free kvm_cpuid_entry2 array on post-KVM_RUN KVM_SET_CPUID{,2} Free the "struct kvm_cpuid_entry2" array on successful post-KVM_RUN KVM_SET_CPUID{,2} to fix a memory leak, the callers of kvm_set_cpuid() free the array only on failure. BUG: memory leak unreferenced object 0xffff88810963a800 (size 2048): comm "syz-executor025", pid 3610, jiffies 4294944928 (age 8.080s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 0d 00 00 00 ................ 47 65 6e 75 6e 74 65 6c 69 6e 65 49 00 00 00 00 GenuntelineI.... backtrace: [] kmalloc_node include/linux/slab.h:604 [inline] [] kvmalloc_node+0x3e/0x100 mm/util.c:580 [] kvmalloc include/linux/slab.h:732 [inline] [] vmemdup_user+0x22/0x100 mm/util.c:199 [] kvm_vcpu_ioctl_set_cpuid2+0x8f/0xf0 arch/x86/kvm/cpuid.c:423 [] kvm_arch_vcpu_ioctl+0xb99/0x1e60 arch/x86/kvm/x86.c:5251 [] kvm_vcpu_ioctl+0x4ad/0x950 arch/x86/kvm/../../../virt/kvm/kvm_main.c:4066 [] vfs_ioctl fs/ioctl.c:51 [inline] [] __do_sys_ioctl fs/ioctl.c:874 [inline] [] __se_sys_ioctl fs/ioctl.c:860 [inline] [] __x64_sys_ioctl+0xfc/0x140 fs/ioctl.c:860 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c6617c61e8fe ("KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN") Cc: stable@vger.kernel.org Reported-by: syzbot+be576ad7655690586eec@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20220125210445.2053429-1-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d0c9b10d448d..28be02adc669 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -359,8 +359,14 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (vcpu->arch.last_vmentry_cpu != -1) - return kvm_cpuid_check_equal(vcpu, e2, nent); + if (vcpu->arch.last_vmentry_cpu != -1) { + r = kvm_cpuid_check_equal(vcpu, e2, nent); + if (r) + return r; + + kvfree(e2); + return 0; + } r = kvm_check_cpuid(vcpu, e2, nent); if (r) From 663d34c8df98740f1e90241e78e456d00b3c6cad Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 20 Jan 2022 16:23:19 +0100 Subject: [PATCH 0665/1295] s390/hypfs: include z/VM guests with access control group set Currently if z/VM guest is allowed to retrieve hypervisor performance data globally for all guests (privilege class B) the query is formed in a way to include all guests but the group name is left empty. This leads to that z/VM guests which have access control group set not being included in the results (even local vm). Change the query group identifier from empty to "any" to retrieve information about all guests from any groups (or without a group set). Cc: stable@vger.kernel.org Fixes: 31cb4bd31a48 ("[S390] Hypervisor filesystem (s390_hypfs) for z/VM") Reviewed-by: Gerald Schaefer Signed-off-by: Vasily Gorbik --- arch/s390/hypfs/hypfs_vm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index 33f973ff9744..e8f15dbb89d0 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -20,6 +20,7 @@ static char local_guest[] = " "; static char all_guests[] = "* "; +static char *all_groups = all_guests; static char *guest_query; struct diag2fc_data { @@ -62,10 +63,11 @@ static int diag2fc(int size, char* query, void *addr) memcpy(parm_list.userid, query, NAME_LEN); ASCEBC(parm_list.userid, NAME_LEN); - parm_list.addr = (unsigned long) addr ; + memcpy(parm_list.aci_grp, all_groups, NAME_LEN); + ASCEBC(parm_list.aci_grp, NAME_LEN); + parm_list.addr = (unsigned long)addr; parm_list.size = size; parm_list.fmt = 0x02; - memset(parm_list.aci_grp, 0x40, NAME_LEN); rc = -1; diag_stat_inc(DIAG_STAT_X2FC); From be4f3b3f82271c3193ce200a996dc70682c8e622 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 26 Jan 2022 17:22:24 +0000 Subject: [PATCH 0666/1295] KVM: x86: Keep MSR_IA32_XSS unchanged for INIT It has been corrected from SDM version 075 that MSR_IA32_XSS is reset to zero on Power up and Reset but keeps unchanged on INIT. Fixes: a554d207dc46 ("KVM: X86: Processor States following Reset or INIT") Cc: stable@vger.kernel.org Signed-off-by: Xiaoyao Li Signed-off-by: Sean Christopherson Message-Id: <20220126172226.2298529-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a50baf4c5bff..b6d60c296eda 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11266,6 +11266,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.msr_misc_features_enables = 0; vcpu->arch.xcr0 = XFEATURE_MASK_FP; + vcpu->arch.ia32_xss = 0; } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ @@ -11282,8 +11283,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); - vcpu->arch.ia32_xss = 0; - static_call(kvm_x86_vcpu_reset)(vcpu, init_event); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); From 4c282e51e4450b94680d6ca3b10f830483b1f243 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 26 Jan 2022 17:22:25 +0000 Subject: [PATCH 0667/1295] KVM: x86: Update vCPU's runtime CPUID on write to MSR_IA32_XSS Do a runtime CPUID update for a vCPU if MSR_IA32_XSS is written, as the size in bytes of the XSAVE area is affected by the states enabled in XSS. Fixes: 203000993de5 ("kvm: vmx: add MSR logic for XSAVES") Cc: stable@vger.kernel.org Signed-off-by: Like Xu [sean: split out as a separate patch, adjust Fixes tag] Signed-off-by: Sean Christopherson Message-Id: <20220126172226.2298529-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6d60c296eda..9c984eeed30c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3535,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; + kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) From 05a9e065059e566f218f8778c4d17ee75db56c55 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 26 Jan 2022 17:22:26 +0000 Subject: [PATCH 0668/1295] KVM: x86: Sync the states size with the XCR0/IA32_XSS at, any time XCR0 is reset to 1 by RESET but not INIT and IA32_XSS is zeroed by both RESET and INIT. The kvm_set_msr_common()'s handling of MSR_IA32_XSS also needs to update kvm_update_cpuid_runtime(). In the above cases, the size in bytes of the XSAVE area containing all states enabled by XCR0 or (XCRO | IA32_XSS) needs to be updated. For simplicity and consistency, existing helpers are used to write values and call kvm_update_cpuid_runtime(), and it's not exactly a fast path. Fixes: a554d207dc46 ("KVM: X86: Processor States following Reset or INIT") Cc: stable@vger.kernel.org Signed-off-by: Like Xu Signed-off-by: Sean Christopherson Message-Id: <20220126172226.2298529-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c984eeed30c..5481d2259f02 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11266,8 +11266,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.msr_misc_features_enables = 0; - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - vcpu->arch.ia32_xss = 0; + __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ From dd4516aee365fc9c944c9d6036b6b87363398680 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jan 2022 07:44:34 -0500 Subject: [PATCH 0669/1295] selftests: kvm: move vm_xsave_req_perm call to amx_test There is no need for tests other than amx_test to enable dynamic xsave states. Remove the call to vm_xsave_req_perm from generic code, and move it inside the test. While at it, allow customizing the bit that is requested, so that future tests can use it differently. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 - .../testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 7 ------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 ++++++------ tools/testing/selftests/kvm/x86_64/amx_test.c | 2 ++ 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 66775de26952..4ed6aa049a91 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, * guest_code - The vCPU's entry point */ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); -void vm_xsave_req_perm(void); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 423d8a61bd2e..8a470da7b71a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +void vm_xsave_req_perm(int bit); enum x86_page_size { X86_PAGE_SIZE_4K = 0, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8c53f96ab7fe..d8cf851ab119 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, struct kvm_vm *vm; int i; -#ifdef __x86_64__ - /* - * Permission needs to be requested before KVM_SET_CPUID2. - */ - vm_xsave_req_perm(); -#endif - /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5f9d7e91dc69..c1d1c195a838 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -665,16 +665,16 @@ static bool is_xfd_supported(void) return !!(eax & CPUID_XFD_BIT); } -void vm_xsave_req_perm(void) +void vm_xsave_req_perm(int bit) { - unsigned long bitmask; + u64 bitmask; long rc; if (!is_xfd_supported()) - return; + exit(KSFT_SKIP); + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); - rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, - XSTATE_XTILE_DATA_BIT); /* * The older kernel version(<5.15) can't support * ARCH_REQ_XCOMP_GUEST_PERM and directly return. @@ -684,7 +684,7 @@ void vm_xsave_req_perm(void) rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); - TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK, + TEST_ASSERT(bitmask & (1ULL << bit), "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", bitmask); } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 523c1e99ed64..52a3ef6629e8 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -329,6 +329,8 @@ int main(int argc, char *argv[]) u32 amx_offset; int stage, ret; + vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); + /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code); From fc55e63e148f1db2180867da875460a00aac8bd1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 20:32:43 +0300 Subject: [PATCH 0670/1295] counter: fix an IS_ERR() vs NULL bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 8 callers for devm_counter_alloc() and they all check for NULL instead of error pointers. I think NULL is the better thing to return for allocation functions so update counter_alloc() and devm_counter_alloc() to return NULL instead of error pointers. Fixes: c18e2760308e ("counter: Provide alternative counter registration functions") Acked-by: Uwe Kleine-König Acked-by: William Breathitt Gray Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220111173243.GA2192@kili Signed-off-by: Greg Kroah-Hartman --- drivers/counter/counter-core.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/counter/counter-core.c b/drivers/counter/counter-core.c index 7e0957eea094..869894b74741 100644 --- a/drivers/counter/counter-core.c +++ b/drivers/counter/counter-core.c @@ -90,10 +90,8 @@ struct counter_device *counter_alloc(size_t sizeof_priv) int err; ch = kzalloc(sizeof(*ch) + sizeof_priv, GFP_KERNEL); - if (!ch) { - err = -ENOMEM; - goto err_alloc_ch; - } + if (!ch) + return NULL; counter = &ch->counter; dev = &counter->dev; @@ -123,9 +121,8 @@ err_chrdev_add: err_ida_alloc: kfree(ch); -err_alloc_ch: - return ERR_PTR(err); + return NULL; } EXPORT_SYMBOL_GPL(counter_alloc); @@ -208,12 +205,12 @@ struct counter_device *devm_counter_alloc(struct device *dev, size_t sizeof_priv int err; counter = counter_alloc(sizeof_priv); - if (IS_ERR(counter)) - return counter; + if (!counter) + return NULL; err = devm_add_action_or_reset(dev, devm_counter_put, counter); if (err < 0) - return ERR_PTR(err); + return NULL; return counter; } From 3758a6c74e08bdc15ccccd6872a6ad37d165239a Mon Sep 17 00:00:00 2001 From: Evgenii Stepanov Date: Tue, 25 Jan 2022 10:22:17 -0800 Subject: [PATCH 0671/1295] arm64: extable: fix load_unaligned_zeropad() reg indices In ex_handler_load_unaligned_zeropad() we erroneously extract the data and addr register indices from ex->type rather than ex->data. As ex->type will contain EX_TYPE_LOAD_UNALIGNED_ZEROPAD (i.e. 4): * We'll always treat X0 as the address register, since EX_DATA_REG_ADDR is extracted from bits [9:5]. Thus, we may attempt to dereference an arbitrary address as X0 may hold an arbitrary value. * We'll always treat X4 as the data register, since EX_DATA_REG_DATA is extracted from bits [4:0]. Thus we will corrupt X4 and cause arbitrary behaviour within load_unaligned_zeropad() and its caller. Fix this by extracting both values from ex->data as originally intended. On an MTE-enabled QEMU image we are hitting the following crash: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Call trace: fixup_exception+0xc4/0x108 __do_kernel_fault+0x3c/0x268 do_tag_check_fault+0x3c/0x104 do_mem_abort+0x44/0xf4 el1_abort+0x40/0x64 el1h_64_sync_handler+0x60/0xa0 el1h_64_sync+0x7c/0x80 link_path_walk+0x150/0x344 path_openat+0xa0/0x7dc do_filp_open+0xb8/0x168 do_sys_openat2+0x88/0x17c __arm64_sys_openat+0x74/0xa0 invoke_syscall+0x48/0x148 el0_svc_common+0xb8/0xf8 do_el0_svc+0x28/0x88 el0_svc+0x24/0x84 el0t_64_sync_handler+0x88/0xec el0t_64_sync+0x1b4/0x1b8 Code: f8695a69 71007d1f 540000e0 927df12a (f940014a) Fixes: 753b32368705 ("arm64: extable: add load_unaligned_zeropad() handler") Cc: # 5.16.x Reviewed-by: Mark Rutland Signed-off-by: Evgenii Stepanov Link: https://lore.kernel.org/r/20220125182217.2605202-1-eugenis@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/extable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index c0181e60cc98..489455309695 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -40,8 +40,8 @@ static bool ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) { - int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->type); - int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->type); + int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data); + int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned long data, addr, offset; addr = pt_regs_read_reg(regs, reg_addr); From 89d43d0551a848e70e63d9ba11534aaeabc82443 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 12 Jan 2022 12:29:04 +0800 Subject: [PATCH 0672/1295] ceph: put the requests/sessions when it fails to alloc memory When failing to allocate the sessions memory we should make sure the req1 and req2 and the sessions get put. And also in case the max_sessions decreased so when kreallocate the new memory some sessions maybe missed being put. And if the max_sessions is 0 krealloc will return ZERO_SIZE_PTR, which will lead to a distinct access fault. URL: https://tracker.ceph.com/issues/53819 Fixes: e1a4541ec0b9 ("ceph: flush the mdlog before waiting on unsafe reqs") Signed-off-by: Xiubo Li Reviewed-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7d305b974824..b472cd066d1c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2218,6 +2218,7 @@ static int unsafe_request_wait(struct inode *inode) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; + unsigned int max_sessions; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); @@ -2235,37 +2236,45 @@ static int unsafe_request_wait(struct inode *inode) } spin_unlock(&ci->i_unsafe_lock); + /* + * The mdsc->max_sessions is unlikely to be changed + * mostly, here we will retry it by reallocating the + * sessions array memory to get rid of the mdsc->mutex + * lock. + */ +retry: + max_sessions = mdsc->max_sessions; + /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ - if (req1 || req2) { + if ((req1 || req2) && likely(max_sessions)) { struct ceph_mds_session **sessions = NULL; struct ceph_mds_session *s; struct ceph_mds_request *req; - unsigned int max; int i; - /* - * The mdsc->max_sessions is unlikely to be changed - * mostly, here we will retry it by reallocating the - * sessions arrary memory to get rid of the mdsc->mutex - * lock. - */ -retry: - max = mdsc->max_sessions; - sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO); - if (!sessions) - return -ENOMEM; + sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + if (!sessions) { + err = -ENOMEM; + goto out; + } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2278,8 +2287,14 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2300,7 +2315,7 @@ retry: spin_unlock(&ci->i_ceph_lock); /* send flush mdlog request to MDSes */ - for (i = 0; i < max; i++) { + for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); @@ -2317,15 +2332,19 @@ retry: ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req1); } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req2); } + +out: + if (req1) + ceph_mdsc_put_request(req1); + if (req2) + ceph_mdsc_put_request(req2); return err; } From 932a9b5870d38b87ba0a9923c804b1af7d3605b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 25 Jan 2022 15:39:16 -0500 Subject: [PATCH 0673/1295] ceph: properly put ceph_string reference after async create attempt The reference acquired by try_prep_async_create is currently leaked. Ensure we put it. Cc: stable@vger.kernel.org Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5b9104b8e453..6afae97be9da 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -750,8 +750,10 @@ retry: restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); try_async = false; + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; } + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto out_req; } } From 4584a768f22b7669cdebabc911543621ac661341 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 26 Jan 2022 12:36:49 -0500 Subject: [PATCH 0674/1295] ceph: set pool_ns in new inode layout for async creates Dan reported that he was unable to write to files that had been asynchronously created when the client's OSD caps are restricted to a particular namespace. The issue is that the layout for the new inode is only partially being filled. Ensure that we populate the pool_ns_data and pool_ns_len in the iinfo before calling ceph_fill_inode. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/54013 Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") Reported-by: Dan van der Ster Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6afae97be9da..bbed3224ad68 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -583,6 +583,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct inode *inode; struct timespec64 now; + struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; @@ -632,6 +633,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, in.max_size = cpu_to_le64(lo->stripe_unit); ceph_file_layout_to_legacy(lo, &in.layout); + /* lo is private, so pool_ns can't change */ + pool_ns = rcu_dereference_raw(lo->pool_ns); + if (pool_ns) { + iinfo.pool_ns_len = pool_ns->len; + iinfo.pool_ns_data = pool_ns->str; + } down_read(&mdsc->snap_rwsem); ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, From da123016ca8cb5697366c0b2dd55059b976e67e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jan 2022 10:42:58 -0800 Subject: [PATCH 0675/1295] rcu-tasks: Fix computation of CPU-to-list shift counts The ->percpu_enqueue_shift field is used to map from the running CPU number to the index of the corresponding callback list. This mapping can change at runtime in response to varying callback load, resulting in varying levels of contention on the callback-list locks. Unfortunately, the initial value of this field is correct only if the system happens to have a power-of-two number of CPUs, otherwise the callbacks from the high-numbered CPUs can be placed into the callback list indexed by 1 (rather than 0), and those index-1 callbacks will be ignored. This can result in soft lockups and hangs. This commit therefore corrects this mapping, adding one to this shift count as needed for systems having odd numbers of CPUs. Fixes: 7a30871b6a27 ("rcu-tasks: Introduce ->percpu_enqueue_shift for dynamic queue selection") Reported-by: Andrii Nakryiko Cc: Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 84f1d91604cc..d64f0b1d8cd3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ - .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ + .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \ .percpu_enqueue_lim = 1, \ .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ @@ -216,6 +216,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; unsigned long flags; int lim; + int shift; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { @@ -229,7 +230,10 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (lim > nr_cpu_ids) lim = nr_cpu_ids; - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + shift = ilog2(nr_cpu_ids / lim); + if (((nr_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); WRITE_ONCE(rtp->percpu_dequeue_lim, lim); smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { @@ -298,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); @@ -413,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); From 7355bfe0e0cc27597d530f78e259a985cb85af40 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 23 Jan 2022 13:57:17 +0100 Subject: [PATCH 0676/1295] netfilter: Remove flowtable relics NF_FLOW_TABLE_IPV4 and NF_FLOW_TABLE_IPV6 are invisble, selected by nothing (so they can no longer be enabled), and their last real users have been removed (nf_flow_table_ipv6.c is empty). Clean up the leftovers. Fixes: c42ba4290b2147aa ("netfilter: flowtable: remove ipv4/ipv6 modules") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 4 ---- net/ipv6/netfilter/Kconfig | 4 ---- net/ipv6/netfilter/Makefile | 3 --- net/ipv6/netfilter/nf_flow_table_ipv6.c | 0 4 files changed, 11 deletions(-) delete mode 100644 net/ipv6/netfilter/nf_flow_table_ipv6.c diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 67087f95579f..aab384126f61 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -58,10 +58,6 @@ config NF_TABLES_ARP endif # NF_TABLES -config NF_FLOW_TABLE_IPV4 - tristate - select NF_FLOW_TABLE_INET - config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 97d3d1b36dbc..0ba62f4868f9 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -47,10 +47,6 @@ config NFT_FIB_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES -config NF_FLOW_TABLE_IPV6 - tristate - select NF_FLOW_TABLE_INET - config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b85383606df7..b8d6dc9aeeb6 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -28,9 +28,6 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o -# flow table support -obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o - # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c deleted file mode 100644 index e69de29bb2d1..000000000000 From 34243b9ec856309339172b1507379074156947e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 23 Jan 2022 15:24:00 +0100 Subject: [PATCH 0677/1295] netfilter: nft_ct: fix use after free when attaching zone template The conversion erroneously removed the refcount increment. In case we can use the percpu template, we need to increment the refcount, else it will be released when the skb gets freed. In case the slowpath is taken, the new template already has a refcount of 1. Fixes: 719774377622 ("netfilter: conntrack: convert to refcount_t api") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 518d96c8c247..5adf8bb628a8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -260,9 +260,12 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); if (likely(refcount_read(&ct->ct_general.use) == 1)) { + refcount_inc(&ct->ct_general.use); nf_ct_zone_add(ct, &zone); } else { - /* previous skb got queued to userspace */ + /* previous skb got queued to userspace, allocate temporary + * one until percpu template can be reused. + */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; From c858620d2ae3489409af593f005a48a8a324da3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 23 Jan 2022 15:45:54 +0100 Subject: [PATCH 0678/1295] selftests: netfilter: reduce zone stress test running time This selftests needs almost 3 minutes to complete, reduce the insertes zones to 1000. Test now completes in about 20 seconds. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_zones_many.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh index 04633119b29a..5a8db0b48928 100755 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/netfilter/nft_zones_many.sh @@ -9,7 +9,7 @@ ns="ns-$sfx" # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -zones=20000 +zones=2000 have_ct_tool=0 ret=0 @@ -75,10 +75,10 @@ EOF while [ $i -lt $max_zones ]; do local start=$(date +%s%3N) - i=$((i + 10000)) + i=$((i + 1000)) j=$((j + 1)) # nft rule in output places each packet in a different zone. - dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + dd if=/dev/zero of=/dev/stdout bs=8k count=1000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 if [ $? -ne 0 ] ;then ret=1 break @@ -86,7 +86,7 @@ EOF stop=$(date +%s%3N) local duration=$((stop-start)) - echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)" + echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" done if [ $have_ct_tool -eq 1 ]; then @@ -128,11 +128,11 @@ test_conntrack_tool() { break fi - if [ $((i%10000)) -eq 0 ];then + if [ $((i%1000)) -eq 0 ];then stop=$(date +%s%3N) local duration=$((stop-start)) - echo "PASS: added 10000 entries in $duration ms (now $i total)" + echo "PASS: added 1000 entries in $duration ms (now $i total)" start=$stop fi done From aad51ca71ad83273e8826d6cfdcf53c98748d1fa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 24 Jan 2022 22:09:15 +0100 Subject: [PATCH 0679/1295] selftests: netfilter: check stateless nat udp checksum fixup Add a test that sends large udp packet (which is fragmented) via a stateless nft nat rule, i.e. 'ip saddr set 10.2.3.4' and check that the datagram is received by peer. On kernels without commit 4e1860a38637 ("netfilter: nft_payload: do not update layer 4 checksum when mangling fragments")', this will fail with: cmp: EOF on /tmp/tmp.V1q0iXJyQF which is empty -rw------- 1 root root 4096 Jan 24 22:03 /tmp/tmp.Aaqnq4rBKS -rw------- 1 root root 0 Jan 24 22:03 /tmp/tmp.V1q0iXJyQF ERROR: in and output file mismatch when checking udp with stateless nat FAIL: nftables v1.0.0 (Fearless Fosdick #2) On patched kernels, this will show: PASS: IP statless for ns2-PFp89amx Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_nat.sh | 152 +++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index 349a319a9e51..79fe627b9e81 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -899,6 +899,144 @@ EOF ip netns exec "$ns0" nft delete table $family nat } +test_stateless_nat_ip() +{ + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" + return 1 + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" + lret=1 + fi + + # ns1 should have seen packets from .2.2, due to stateless rewrite. + expect="packets 1 bytes 84" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" + lret=1 + fi + + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" + lret=1 + fi + done + + reset_counters + + socat -h > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run stateless nat frag test without socat tool" + if [ $lret -eq 0 ]; then + return $ksft_skip + fi + + ip netns exec "$ns0" nft delete table ip stateless + return $lret + fi + + local tmpfile=$(mktemp) + dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + + local outfile=$(mktemp) + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & + sc_r=$! + + sleep 1 + # re-do with large ping -> ip fragmentation + ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null + if [ $? -ne 0 ] ; then + echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 + lret=1 + fi + + wait + + cmp "$tmpfile" "$outfile" + if [ $? -ne 0 ]; then + ls -l "$tmpfile" "$outfile" + echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 + lret=1 + fi + + rm -f "$tmpfile" "$outfile" + + # ns1 should have seen packets from 2.2, due to stateless rewrite. + expect="packets 3 bytes 4164" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" + lret=1 + fi + + ip netns exec "$ns0" nft delete table ip stateless + if [ $? -ne 0 ]; then + echo "ERROR: Could not delete table ip stateless" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: IP statless for $ns2" + + return $lret +} + # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 for i in 0 1 2; do ip netns exec ns$i-$sfx nft -f /dev/stdin < Date: Tue, 25 Jan 2022 20:06:03 +0100 Subject: [PATCH 0680/1295] netfilter: nft_reject_bridge: Fix for missing reply from prerouting Prior to commit fa538f7cf05aa ("netfilter: nf_reject: add reject skbuff creation helpers"), nft_reject_bridge did not assign to nskb->dev before passing nskb on to br_forward(). The shared skbuff creation helpers introduced in above commit do which seems to confuse br_forward() as reject statements in prerouting hook won't emit a packet anymore. Fix this by simply passing NULL instead of 'dev' to the helpers - they use the pointer for just that assignment, nothing else. Fixes: fa538f7cf05aa ("netfilter: nf_reject: add reject skbuff creation helpers") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index eba0efe64d05..fbf858ddec35 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -49,7 +49,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -65,7 +65,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v4_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; @@ -81,7 +81,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, NULL, hook); if (!nskb) return; @@ -98,7 +98,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, { struct sk_buff *nskb; - nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + nskb = nf_reject_skb_v6_unreach(net, oldskb, NULL, hook, code); if (!nskb) return; From f459bfd4b9793f25e0fcf19878edd87d8dc569d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Jan 2022 01:46:58 +0100 Subject: [PATCH 0681/1295] netfilter: nft_byteorder: track register operations Cancel tracking for byteorder operation, otherwise selector + byteorder operation is incorrectly reduced if source and destination registers are the same. Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_byteorder.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 9d5947ab8d4e..e646e9ee4a98 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -167,12 +167,24 @@ nla_put_failure: return -1; } +static bool nft_byteorder_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + struct nft_byteorder *priv = nft_expr_priv(expr); + + track->regs[priv->dreg].selector = NULL; + track->regs[priv->dreg].bitwise = NULL; + + return false; +} + static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, + .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { From eda0cf1202acf1ef47f93d8f92d4839213431424 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jan 2022 12:54:54 +0100 Subject: [PATCH 0682/1295] selftests: nft_concat_range: add test for reload with no element add/del Add a specific test for the reload issue fixed with commit 23c54263efd7cb ("netfilter: nft_set_pipapo: allocate pcpu scratch maps on clone"). Add to set, then flush set content + restore without other add/remove in the transaction. On kernels before the fix, this test case fails: net,mac with reload [FAIL] Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_concat_range.sh | 72 ++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh index ed61f6cab60f..df322e47a54f 100755 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/netfilter/nft_concat_range.sh @@ -27,7 +27,7 @@ TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto net6_port_net6_port net_port_mac_proto_net" # Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add" +BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" @@ -354,6 +354,23 @@ TYPE_flush_remove_add=" display Add two elements, flush, re-add " +TYPE_reload=" +display net,mac with reload +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 1 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + # Set template for all tests, types and rules are filled in depending on test set_template=' flush ruleset @@ -1473,6 +1490,59 @@ test_bug_flush_remove_add() { nft flush ruleset } +# - add ranged element, check that packets match it +# - reload the set, check packets still match +test_bug_reload() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + rstart=${start} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + # check kernel does allocate pcpu sctrach map + # for reload with no elemet add/delete + ( echo flush set inet filter test ; + nft list set inet filter test ) | nft -f - + + start=${rstart} + range_size=1 + + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset +} + test_reported_issues() { eval test_bug_"${subtest}" } From f9d87929d451d3e649699d0f1d74f71f77ad38f5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jan 2022 12:46:50 -0600 Subject: [PATCH 0683/1295] ucount: Make get_ucount a safe get_user replacement When the ucount code was refactored to create get_ucount it was missed that some of the contexts in which a rlimit is kept elevated can be the only reference to the user/ucount in the system. Ordinary ucount references exist in places that also have a reference to the user namspace, but in POSIX message queues, the SysV shm code, and the SIGPENDING code there is no independent user namespace reference. Inspection of the the user_namespace show no instance of circular references between struct ucounts and the user_namespace. So hold a reference from struct ucount to i's user_namespace to resolve this problem. Link: https://lore.kernel.org/lkml/YZV7Z+yXbsx9p3JN@fixkernel.com/ Reported-by: Qian Cai Reported-by: Mathias Krause Tested-by: Mathias Krause Reviewed-by: Mathias Krause Reviewed-by: Alexey Gladkov Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") Fixes: 6e52a9f0532f ("Reimplement RLIMIT_MSGQUEUE on top of ucounts") Fixes: d7c9e99aee48 ("Reimplement RLIMIT_MEMLOCK on top of ucounts") Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/ucount.c b/kernel/ucount.c index 7b32c356ebc5..65b597431c86 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -190,6 +190,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } @@ -210,6 +211,7 @@ void put_ucounts(struct ucounts *ucounts) if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); kfree(ucounts); } } From 9b6d90e2085ca2ce72ef9ea78658bf270855e62e Mon Sep 17 00:00:00 2001 From: Zhou Qingyang Date: Tue, 25 Jan 2022 00:45:25 +0800 Subject: [PATCH 0684/1295] ata: pata_platform: Fix a NULL pointer dereference in __pata_platform_probe() In __pata_platform_probe(), devm_kzalloc() is assigned to ap->ops and there is a dereference of it right after that, which could introduce a NULL pointer dereference bug. Fix this by adding a NULL check of ap->ops. This bug was found by a static analyzer. Builds with 'make allyesconfig' show no new warnings, and our static analyzer no longer warns about this code. Fixes: f3d5e4f18dba ("ata: pata_of_platform: Allow to use 16-bit wide data transfer") Signed-off-by: Zhou Qingyang Signed-off-by: Damien Le Moal Reviewed-by: Sergey Shtylyov --- drivers/ata/pata_platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c index 028329428b75..87c7c90676ca 100644 --- a/drivers/ata/pata_platform.c +++ b/drivers/ata/pata_platform.c @@ -128,6 +128,8 @@ int __pata_platform_probe(struct device *dev, struct resource *io_res, ap = host->ports[0]; ap->ops = devm_kzalloc(dev, sizeof(*ap->ops), GFP_KERNEL); + if (!ap->ops) + return -ENOMEM; ap->ops->inherits = &ata_sff_port_ops; ap->ops->cable_detect = ata_cable_unknown; ap->ops->set_mode = pata_platform_set_mode; From a92f7a6feeb3884c69c1c7c1f13bccecb2228ad0 Mon Sep 17 00:00:00 2001 From: Catherine Sullivan Date: Tue, 25 Jan 2022 16:38:43 -0800 Subject: [PATCH 0685/1295] gve: Fix GFP flags when allocing pages Use GFP_ATOMIC when allocating pages out of the hotpath, continue to use GFP_KERNEL when allocating pages during setup. GFP_KERNEL will allow blocking which allows it to succeed more often in a low memory enviornment but in the hotpath we do not want to allow the allocation to block. Fixes: f5cedc84a30d2 ("gve: Add transmit and receive support") Signed-off-by: Catherine Sullivan Signed-off-by: David Awogbemila Link: https://lore.kernel.org/r/20220126003843.3584521-1-awogbemila@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve.h | 2 +- drivers/net/ethernet/google/gve/gve_main.c | 6 +++--- drivers/net/ethernet/google/gve/gve_rx.c | 3 ++- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 5f5d4f7aa813..160735484465 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -843,7 +843,7 @@ static inline bool gve_is_gqi(struct gve_priv *priv) /* buffers */ int gve_alloc_page(struct gve_priv *priv, struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction); + enum dma_data_direction, gfp_t gfp_flags); void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, enum dma_data_direction); /* tx handling */ diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f7f65c4bf993..54e51c8221b8 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -766,9 +766,9 @@ static void gve_free_rings(struct gve_priv *priv) int gve_alloc_page(struct gve_priv *priv, struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction dir) + enum dma_data_direction dir, gfp_t gfp_flags) { - *page = alloc_page(GFP_KERNEL); + *page = alloc_page(gfp_flags); if (!*page) { priv->page_alloc_fail++; return -ENOMEM; @@ -811,7 +811,7 @@ static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id, for (i = 0; i < pages; i++) { err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i], &qpl->page_buses[i], - gve_qpl_dma_dir(priv, id)); + gve_qpl_dma_dir(priv, id), GFP_KERNEL); /* caller handles clean up */ if (err) return -ENOMEM; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index 9ddcc497f48e..2068199445bd 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -86,7 +86,8 @@ static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev, dma_addr_t dma; int err; - err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE); + err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE, + GFP_ATOMIC); if (err) return err; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index beb8bb079023..8c939628e2d8 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -157,7 +157,7 @@ static int gve_alloc_page_dqo(struct gve_priv *priv, int err; err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, - &buf_state->addr, DMA_FROM_DEVICE); + &buf_state->addr, DMA_FROM_DEVICE, GFP_KERNEL); if (err) return err; From d7e4f8545b497b3f5687e592f1c355cbaee64c8c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 26 Jan 2022 13:04:26 +0800 Subject: [PATCH 0686/1295] pid: Introduce helper task_is_in_init_pid_ns() Currently the kernel uses open code in multiple places to check if a task is in the root PID namespace with the kind of format: if (task_active_pid_ns(current) == &init_pid_ns) do_something(); This patch creates a new helper function, task_is_in_init_pid_ns(), it returns true if a passed task is in the root PID namespace, otherwise returns false. So it will be used to replace open codes. Suggested-by: Suzuki K Poulose Signed-off-by: Leo Yan Reviewed-by: Leon Romanovsky Acked-by: Suzuki K Poulose Acked-by: Balbir Singh Signed-off-by: Jakub Kicinski --- include/linux/pid_namespace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c7e627503d2..07481bb87d4e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -86,4 +86,9 @@ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); +static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) +{ + return task_active_pid_ns(tsk) == &init_pid_ns; +} + #endif /* _LINUX_PID_NS_H */ From 42c66d16756402c4749d94f005a998a43e8fa338 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 26 Jan 2022 13:04:27 +0800 Subject: [PATCH 0687/1295] connector/cn_proc: Use task_is_in_init_pid_ns() This patch replaces open code with task_is_in_init_pid_ns() to check if a task is in root PID namespace. Signed-off-by: Leo Yan Acked-by: Balbir Singh Signed-off-by: Jakub Kicinski --- drivers/connector/cn_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 646ad385e490..ccac1c453080 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -358,7 +358,7 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, * other namespaces. */ if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) + !task_is_in_init_pid_ns(current)) return; /* Can only change if privileged. */ From 37291f60d0822f191748c2a54ce63b0bc669020f Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 25 Jan 2022 18:16:00 -0600 Subject: [PATCH 0688/1295] phy: xilinx: zynqmp: Fix bus width setting for SGMII TX_PROT_BUS_WIDTH and RX_PROT_BUS_WIDTH are single registers with separate bit fields for each lane. The code in xpsgtr_phy_init_sgmii was not preserving the existing register value for other lanes, so enabling the PHY in SGMII mode on one lane zeroed out the settings for all other lanes, causing other PS-GTR peripherals such as USB3 to malfunction. Use xpsgtr_clr_set to only manipulate the desired bits in the register. Fixes: 4a33bea00314 ("phy: zynqmp: Add PHY driver for the Xilinx ZynqMP Gigabit Transceiver") Signed-off-by: Robert Hancock Acked-by: Michal Simek Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20220126001600.1592218-1-robert.hancock@calian.com Signed-off-by: Vinod Koul --- drivers/phy/xilinx/phy-zynqmp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/phy/xilinx/phy-zynqmp.c b/drivers/phy/xilinx/phy-zynqmp.c index f478d8a17115..9be9535ad7ab 100644 --- a/drivers/phy/xilinx/phy-zynqmp.c +++ b/drivers/phy/xilinx/phy-zynqmp.c @@ -134,7 +134,8 @@ #define PROT_BUS_WIDTH_10 0x0 #define PROT_BUS_WIDTH_20 0x1 #define PROT_BUS_WIDTH_40 0x2 -#define PROT_BUS_WIDTH_SHIFT 2 +#define PROT_BUS_WIDTH_SHIFT(n) ((n) * 2) +#define PROT_BUS_WIDTH_MASK(n) GENMASK((n) * 2 + 1, (n) * 2) /* Number of GT lanes */ #define NUM_LANES 4 @@ -445,12 +446,12 @@ static void xpsgtr_phy_init_sata(struct xpsgtr_phy *gtr_phy) static void xpsgtr_phy_init_sgmii(struct xpsgtr_phy *gtr_phy) { struct xpsgtr_dev *gtr_dev = gtr_phy->dev; + u32 mask = PROT_BUS_WIDTH_MASK(gtr_phy->lane); + u32 val = PROT_BUS_WIDTH_10 << PROT_BUS_WIDTH_SHIFT(gtr_phy->lane); /* Set SGMII protocol TX and RX bus width to 10 bits. */ - xpsgtr_write(gtr_dev, TX_PROT_BUS_WIDTH, - PROT_BUS_WIDTH_10 << (gtr_phy->lane * PROT_BUS_WIDTH_SHIFT)); - xpsgtr_write(gtr_dev, RX_PROT_BUS_WIDTH, - PROT_BUS_WIDTH_10 << (gtr_phy->lane * PROT_BUS_WIDTH_SHIFT)); + xpsgtr_clr_set(gtr_dev, TX_PROT_BUS_WIDTH, mask, val); + xpsgtr_clr_set(gtr_dev, RX_PROT_BUS_WIDTH, mask, val); xpsgtr_bypass_scrambler_8b10b(gtr_phy); } From cfc826c88a79e22ba5d8001556eb2c7efd8a01b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jan 2022 14:17:24 +0300 Subject: [PATCH 0689/1295] phy: stm32: fix a refcount leak in stm32_usbphyc_pll_enable() This error path needs to decrement "usbphyc->n_pll_cons.counter" before returning. Fixes: 5b1af71280ab ("phy: stm32: rework PLL Lock detection") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220112111724.GB3019@kili Signed-off-by: Vinod Koul --- drivers/phy/st/phy-stm32-usbphyc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/st/phy-stm32-usbphyc.c b/drivers/phy/st/phy-stm32-usbphyc.c index 2ce9bfd783d4..007a23c78d56 100644 --- a/drivers/phy/st/phy-stm32-usbphyc.c +++ b/drivers/phy/st/phy-stm32-usbphyc.c @@ -304,7 +304,7 @@ static int stm32_usbphyc_pll_enable(struct stm32_usbphyc *usbphyc) ret = __stm32_usbphyc_pll_disable(usbphyc); if (ret) - return ret; + goto dec_n_pll_cons; } ret = stm32_usbphyc_regulators_enable(usbphyc); From 46e994717807f4b935c44d81dde9dd8bcd9a4f5d Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Fri, 7 Jan 2022 10:50:50 +0800 Subject: [PATCH 0690/1295] phy: phy-mtk-tphy: Fix duplicated argument in phy-mtk-tphy Fix following coccicheck warning: ./drivers/phy/mediatek/phy-mtk-tphy.c:994:6-29: duplicated argument to && or || The efuse_rx_imp is duplicate. Here should be efuse_tx_imp. Signed-off-by: Wan Jiabing Acked-by: Chunfeng Yun Link: https://lore.kernel.org/r/20220107025050.787720-1-wanjiabing@vivo.com Signed-off-by: Vinod Koul --- drivers/phy/mediatek/phy-mtk-tphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/mediatek/phy-mtk-tphy.c b/drivers/phy/mediatek/phy-mtk-tphy.c index 6d307102f4f6..8ee7682b8e93 100644 --- a/drivers/phy/mediatek/phy-mtk-tphy.c +++ b/drivers/phy/mediatek/phy-mtk-tphy.c @@ -992,7 +992,7 @@ static int phy_efuse_get(struct mtk_tphy *tphy, struct mtk_phy_instance *instanc /* no efuse, ignore it */ if (!instance->efuse_intr && !instance->efuse_rx_imp && - !instance->efuse_rx_imp) { + !instance->efuse_tx_imp) { dev_warn(dev, "no u3 intr efuse, but dts enable it\n"); instance->efuse_sw_en = 0; break; From 25e58af4be412d59e056da65cc1cefbd89185bd2 Mon Sep 17 00:00:00 2001 From: Wu Zheng Date: Mon, 21 Jun 2021 19:07:01 -0400 Subject: [PATCH 0691/1295] nvme-pci: add the IGNORE_DEV_SUBNQN quirk for Intel P4500/P4600 SSDs The Intel P4500/P4600 SSDs do not report a subsystem NQN despite claiming compliance to a standards version where reporting one is required. Add the IGNORE_DEV_SUBNQN quirk to not fail the initialization of a second such SSDs in a system. Signed-off-by: Zheng Wu Signed-off-by: Ye Jinhe Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d8585df2c2fd..6a99ed680915 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3391,7 +3391,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, + NVME_QUIRK_DEALLOCATE_ZEROES | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, From a5f3851b7f7951e8d4ba0a9ba3b5308a5f250a2d Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 7 Jan 2022 02:23:06 +0000 Subject: [PATCH 0692/1295] nvme-fabrics: remove the unneeded ret variable in nvmf_dev_show Remove unneeded variable and directly return 0. Reported-by: Zeal Robot Signed-off-by: Changcheng Deng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 7ae041e2b3fb..f79a66d4e22c 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1092,7 +1092,6 @@ static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) static int nvmf_dev_show(struct seq_file *seq_file, void *private) { struct nvme_ctrl *ctrl; - int ret = 0; mutex_lock(&nvmf_dev_mutex); ctrl = seq_file->private; @@ -1106,7 +1105,7 @@ static int nvmf_dev_show(struct seq_file *seq_file, void *private) out_unlock: mutex_unlock(&nvmf_dev_mutex); - return ret; + return 0; } static int nvmf_dev_open(struct inode *inode, struct file *file) From 1293fccc9e892712d910ec96079d3717307f1d2d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:21 +0100 Subject: [PATCH 0693/1295] net: ieee802154: hwsim: Ensure proper channel selection at probe time Drivers are expected to set the PHY current_channel and current_page according to their default state. The hwsim driver is advertising being configured on channel 13 by default but that is not reflected in its own internal pib structure. In order to ensure that this driver consider the current channel as being 13 internally, we at least need to set the pib->channel field to 13. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Miquel Raynal [stefan@datenfreihafen.org: fixed assigment from page to channel] Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 8caa61ec718f..36f1c5aa98fc 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -786,6 +786,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, goto err_pib; } + pib->channel = 13; rcu_assign_pointer(phy->pib, pib); phy->idx = idx; INIT_LIST_HEAD(&phy->edges); From d753c4004820a888ec007dd88b271fa9c3172c5c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:22 +0100 Subject: [PATCH 0694/1295] net: ieee802154: mcr20a: Fix lifs/sifs periods These periods are expressed in time units (microseconds) while 40 and 12 are the number of symbol durations these periods will last. We need to multiply them both with phy->symbol_duration in order to get these values in microseconds. Fixes: 8c6ad9cc5157 ("ieee802154: Add NXP MCR20A IEEE 802.15.4 transceiver driver") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index 8dc04e2590b1..383231b85464 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -976,8 +976,8 @@ static void mcr20a_hw_setup(struct mcr20a_local *lp) dev_dbg(printdev(lp), "%s\n", __func__); phy->symbol_duration = 16; - phy->lifs_period = 40; - phy->sifs_period = 12; + phy->lifs_period = 40 * phy->symbol_duration; + phy->sifs_period = 12 * phy->symbol_duration; hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT | From e5ce576d45bf72fd0e3dc37eff897bfcc488f6a9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:23 +0100 Subject: [PATCH 0695/1295] net: ieee802154: at86rf230: Stop leaking skb's Upon error the ieee802154_xmit_complete() helper is not called. Only ieee802154_wake_queue() is called manually. In the Tx case we then leak the skb structure. Free the skb structure upon error before returning when appropriate. As the 'is_tx = 0' cannot be moved in the complete handler because of a possible race between the delay in switching to STATE_RX_AACK_ON and a new interrupt, we introduce an intermediate 'was_tx' boolean just for this purpose. There is no Fixes tag applying here, many changes have been made on this area and the issue kind of always existed. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/at86rf230.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index 7d67f41387f5..4f5ef8a9a9a8 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -100,6 +100,7 @@ struct at86rf230_local { unsigned long cal_timeout; bool is_tx; bool is_tx_from_off; + bool was_tx; u8 tx_retry; struct sk_buff *tx_skb; struct at86rf230_state_change tx; @@ -343,7 +344,11 @@ at86rf230_async_error_recover_complete(void *context) if (ctx->free) kfree(ctx); - ieee802154_wake_queue(lp->hw); + if (lp->was_tx) { + lp->was_tx = 0; + dev_kfree_skb_any(lp->tx_skb); + ieee802154_wake_queue(lp->hw); + } } static void @@ -352,7 +357,11 @@ at86rf230_async_error_recover(void *context) struct at86rf230_state_change *ctx = context; struct at86rf230_local *lp = ctx->lp; - lp->is_tx = 0; + if (lp->is_tx) { + lp->was_tx = 1; + lp->is_tx = 0; + } + at86rf230_async_state_change(lp, ctx, STATE_RX_AACK_ON, at86rf230_async_error_recover_complete); } From 621b24b09eb61c63f262da0c9c5f0e93348897e5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:24 +0100 Subject: [PATCH 0696/1295] net: ieee802154: ca8210: Stop leaking skb's Upon error the ieee802154_xmit_complete() helper is not called. Only ieee802154_wake_queue() is called manually. We then leak the skb structure. Free the skb structure upon error before returning. Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-5-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index ece6ff6049f6..f3438d3e104a 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -1771,6 +1771,7 @@ static int ca8210_async_xmit_complete( status ); if (status != MAC_TRANSACTION_OVERFLOW) { + dev_kfree_skb_any(priv->tx_skb); ieee802154_wake_queue(priv->hw); return 0; } From 79c37ca73a6e9a33f7b2b7783ba6af07a448c8a9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:25 +0100 Subject: [PATCH 0697/1295] net: ieee802154: Return meaningful error codes from the netlink helpers Returning -1 does not indicate anything useful. Use a standard and meaningful error code instead. Fixes: a26c5fd7622d ("nl802154: add support for security layer") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-6-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/ieee802154/nl802154.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 277124f206e0..e0b072aecf0f 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1441,7 +1441,7 @@ static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1634,7 +1634,7 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1812,7 +1812,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; @@ -1988,7 +1988,7 @@ static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) - return -1; + return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; From 5d8a8b324ff48c9d9fe4f1634e33dc647d2481b4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:14:26 +0100 Subject: [PATCH 0698/1295] MAINTAINERS: Remove Harry Morris bouncing address Harry's e-mail address from Cascoda bounces, I have not found any contributions from him since 2018 so let's drop the Maintainer entry from the CA8210 driver and mark it Orphan. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125121426.848337-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0d7883977e9b..ee938e96b101 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4157,9 +4157,8 @@ N: csky K: csky CA8210 IEEE-802.15.4 RADIO DRIVER -M: Harry Morris L: linux-wpan@vger.kernel.org -S: Maintained +S: Orphan W: https://github.com/Cascoda/ca8210-linux.git F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt F: drivers/net/ieee802154/ca8210.c From d1ad2721b1eb05d54e81393a7ebc332d4a35c68f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 27 Jan 2022 08:33:04 +0100 Subject: [PATCH 0699/1295] kbuild: remove include/linux/cyclades.h from header file check The file now rightfully throws up a big warning that it should never be included, so remove it from the header_check test. Fixes: f23653fe6447 ("tty: Partially revert the removal of the Cyclades public API") Cc: stable Cc: Masahiro Yamada Cc: "Maciej W. Rozycki" Reported-by: Stephen Rothwell Reported-by: kernel test robot Link: https://lore.kernel.org/r/20220127073304.42399-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- usr/include/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/usr/include/Makefile b/usr/include/Makefile index 7be7468c177b..83822c33e9e7 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -28,6 +28,7 @@ no-header-test += linux/am437x-vpfe.h no-header-test += linux/android/binder.h no-header-test += linux/android/binderfs.h no-header-test += linux/coda.h +no-header-test += linux/cyclades.h no-header-test += linux/errqueue.h no-header-test += linux/fsmap.h no-header-test += linux/hdlc/ioctl.h From fa62f39dc7e25fc16371b958ac59b9a6fd260bea Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 25 Jan 2022 15:19:44 +0100 Subject: [PATCH 0700/1295] MIPS: Fix build error due to PTR used in more places Use PTR_WD instead of PTR to avoid clashes with other parts. Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/asm.h | 4 +- arch/mips/include/asm/ftrace.h | 4 +- arch/mips/include/asm/r4kcache.h | 4 +- arch/mips/include/asm/unaligned-emul.h | 176 ++++++++++++------------- arch/mips/kernel/mips-r2-to-r6-emul.c | 104 +++++++-------- arch/mips/kernel/r2300_fpu.S | 6 +- arch/mips/kernel/r4k_fpu.S | 2 +- arch/mips/kernel/relocate_kernel.S | 22 ++-- arch/mips/kernel/scall32-o32.S | 10 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-n64.S | 2 +- arch/mips/kernel/scall64-o32.S | 10 +- arch/mips/kernel/syscall.c | 8 +- arch/mips/lib/csum_partial.S | 4 +- arch/mips/lib/memcpy.S | 4 +- arch/mips/lib/memset.S | 2 +- arch/mips/lib/strncpy_user.S | 4 +- arch/mips/lib/strnlen_user.S | 2 +- 18 files changed, 185 insertions(+), 185 deletions(-) diff --git a/arch/mips/include/asm/asm.h b/arch/mips/include/asm/asm.h index 6ffdd4b5e1d0..336ac9b65235 100644 --- a/arch/mips/include/asm/asm.h +++ b/arch/mips/include/asm/asm.h @@ -285,7 +285,7 @@ symbol = value #define PTR_SCALESHIFT 2 -#define PTR .word +#define PTR_WD .word #define PTRSIZE 4 #define PTRLOG 2 #endif @@ -310,7 +310,7 @@ symbol = value #define PTR_SCALESHIFT 3 -#define PTR .dword +#define PTR_WD .dword #define PTRSIZE 8 #define PTRLOG 3 #endif diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h index b463f2aa5a61..db497a8167da 100644 --- a/arch/mips/include/asm/ftrace.h +++ b/arch/mips/include/asm/ftrace.h @@ -32,7 +32,7 @@ do { \ ".previous\n" \ \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR) "\t1b, 3b\n\t" \ + STR(PTR_WD) "\t1b, 3b\n\t" \ ".previous\n" \ \ : [tmp_dst] "=&r" (dst), [tmp_err] "=r" (error)\ @@ -54,7 +54,7 @@ do { \ ".previous\n" \ \ ".section\t__ex_table,\"a\"\n\t"\ - STR(PTR) "\t1b, 3b\n\t" \ + STR(PTR_WD) "\t1b, 3b\n\t" \ ".previous\n" \ \ : [tmp_err] "=r" (error) \ diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index af3788589ee6..431a1c9d53fc 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -119,7 +119,7 @@ static inline void flush_scache_line(unsigned long addr) " j 2b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 3b \n" \ + " "STR(PTR_WD)" 1b, 3b \n" \ " .previous" \ : "+r" (__err) \ : "i" (op), "r" (addr), "i" (-EFAULT)); \ @@ -142,7 +142,7 @@ static inline void flush_scache_line(unsigned long addr) " j 2b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 3b \n" \ + " "STR(PTR_WD)" 1b, 3b \n" \ " .previous" \ : "+r" (__err) \ : "i" (op), "r" (addr), "i" (-EFAULT)); \ diff --git a/arch/mips/include/asm/unaligned-emul.h b/arch/mips/include/asm/unaligned-emul.h index 2022b18944b9..9af0f4d3d288 100644 --- a/arch/mips/include/asm/unaligned-emul.h +++ b/arch/mips/include/asm/unaligned-emul.h @@ -20,8 +20,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -41,8 +41,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -74,10 +74,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -102,8 +102,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -125,8 +125,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -145,8 +145,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -178,10 +178,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -223,14 +223,14 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ - STR(PTR)"\t5b, 11b\n\t" \ - STR(PTR)"\t6b, 11b\n\t" \ - STR(PTR)"\t7b, 11b\n\t" \ - STR(PTR)"\t8b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t5b, 11b\n\t" \ + STR(PTR_WD)"\t6b, 11b\n\t" \ + STR(PTR_WD)"\t7b, 11b\n\t" \ + STR(PTR_WD)"\t8b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -255,8 +255,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT));\ @@ -276,8 +276,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT)); \ @@ -296,8 +296,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT)); \ @@ -325,10 +325,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT) \ @@ -365,14 +365,14 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ - STR(PTR)"\t5b, 11b\n\t" \ - STR(PTR)"\t6b, 11b\n\t" \ - STR(PTR)"\t7b, 11b\n\t" \ - STR(PTR)"\t8b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t5b, 11b\n\t" \ + STR(PTR_WD)"\t6b, 11b\n\t" \ + STR(PTR_WD)"\t7b, 11b\n\t" \ + STR(PTR_WD)"\t8b, 11b\n\t" \ ".previous" \ : "=&r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT) \ @@ -398,8 +398,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -419,8 +419,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -452,10 +452,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -481,8 +481,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -504,8 +504,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -524,8 +524,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -557,10 +557,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -602,14 +602,14 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ - STR(PTR)"\t5b, 11b\n\t" \ - STR(PTR)"\t6b, 11b\n\t" \ - STR(PTR)"\t7b, 11b\n\t" \ - STR(PTR)"\t8b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t5b, 11b\n\t" \ + STR(PTR_WD)"\t6b, 11b\n\t" \ + STR(PTR_WD)"\t7b, 11b\n\t" \ + STR(PTR_WD)"\t8b, 11b\n\t" \ ".previous" \ : "=&r" (value), "=r" (res) \ : "r" (addr), "i" (-EFAULT)); \ @@ -632,8 +632,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT));\ @@ -653,8 +653,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT)); \ @@ -673,8 +673,8 @@ do { \ "j\t3b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 4b\n\t" \ - STR(PTR)"\t2b, 4b\n\t" \ + STR(PTR_WD)"\t1b, 4b\n\t" \ + STR(PTR_WD)"\t2b, 4b\n\t" \ ".previous" \ : "=r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT)); \ @@ -703,10 +703,10 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ ".previous" \ : "=&r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT) \ @@ -743,14 +743,14 @@ do { \ "j\t10b\n\t" \ ".previous\n\t" \ ".section\t__ex_table,\"a\"\n\t" \ - STR(PTR)"\t1b, 11b\n\t" \ - STR(PTR)"\t2b, 11b\n\t" \ - STR(PTR)"\t3b, 11b\n\t" \ - STR(PTR)"\t4b, 11b\n\t" \ - STR(PTR)"\t5b, 11b\n\t" \ - STR(PTR)"\t6b, 11b\n\t" \ - STR(PTR)"\t7b, 11b\n\t" \ - STR(PTR)"\t8b, 11b\n\t" \ + STR(PTR_WD)"\t1b, 11b\n\t" \ + STR(PTR_WD)"\t2b, 11b\n\t" \ + STR(PTR_WD)"\t3b, 11b\n\t" \ + STR(PTR_WD)"\t4b, 11b\n\t" \ + STR(PTR_WD)"\t5b, 11b\n\t" \ + STR(PTR_WD)"\t6b, 11b\n\t" \ + STR(PTR_WD)"\t7b, 11b\n\t" \ + STR(PTR_WD)"\t8b, 11b\n\t" \ ".previous" \ : "=&r" (res) \ : "r" (value), "r" (addr), "i" (-EFAULT) \ diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c index a39ec755e4c2..750fe569862b 100644 --- a/arch/mips/kernel/mips-r2-to-r6-emul.c +++ b/arch/mips/kernel/mips-r2-to-r6-emul.c @@ -1258,10 +1258,10 @@ fpu_emul: " j 10b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1333,10 +1333,10 @@ fpu_emul: " j 10b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1404,10 +1404,10 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1474,10 +1474,10 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1589,14 +1589,14 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" - STR(PTR) " 5b,8b\n" - STR(PTR) " 6b,8b\n" - STR(PTR) " 7b,8b\n" - STR(PTR) " 0b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" + STR(PTR_WD) " 5b,8b\n" + STR(PTR_WD) " 6b,8b\n" + STR(PTR_WD) " 7b,8b\n" + STR(PTR_WD) " 0b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1708,14 +1708,14 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" - STR(PTR) " 5b,8b\n" - STR(PTR) " 6b,8b\n" - STR(PTR) " 7b,8b\n" - STR(PTR) " 0b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" + STR(PTR_WD) " 5b,8b\n" + STR(PTR_WD) " 6b,8b\n" + STR(PTR_WD) " 7b,8b\n" + STR(PTR_WD) " 0b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1827,14 +1827,14 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" - STR(PTR) " 5b,8b\n" - STR(PTR) " 6b,8b\n" - STR(PTR) " 7b,8b\n" - STR(PTR) " 0b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" + STR(PTR_WD) " 5b,8b\n" + STR(PTR_WD) " 6b,8b\n" + STR(PTR_WD) " 7b,8b\n" + STR(PTR_WD) " 0b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -1945,14 +1945,14 @@ fpu_emul: " j 9b\n" " .previous\n" " .section __ex_table,\"a\"\n" - STR(PTR) " 1b,8b\n" - STR(PTR) " 2b,8b\n" - STR(PTR) " 3b,8b\n" - STR(PTR) " 4b,8b\n" - STR(PTR) " 5b,8b\n" - STR(PTR) " 6b,8b\n" - STR(PTR) " 7b,8b\n" - STR(PTR) " 0b,8b\n" + STR(PTR_WD) " 1b,8b\n" + STR(PTR_WD) " 2b,8b\n" + STR(PTR_WD) " 3b,8b\n" + STR(PTR_WD) " 4b,8b\n" + STR(PTR_WD) " 5b,8b\n" + STR(PTR_WD) " 6b,8b\n" + STR(PTR_WD) " 7b,8b\n" + STR(PTR_WD) " 0b,8b\n" " .previous\n" " .set pop\n" : "+&r"(rt), "=&r"(rs), @@ -2007,7 +2007,7 @@ fpu_emul: "j 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" - STR(PTR) " 1b,3b\n" + STR(PTR_WD) " 1b,3b\n" ".previous\n" : "=&r"(res), "+&r"(err) : "r"(vaddr), "i"(SIGSEGV) @@ -2065,7 +2065,7 @@ fpu_emul: "j 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" - STR(PTR) " 1b,3b\n" + STR(PTR_WD) " 1b,3b\n" ".previous\n" : "+&r"(res), "+&r"(err) : "r"(vaddr), "i"(SIGSEGV)); @@ -2126,7 +2126,7 @@ fpu_emul: "j 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" - STR(PTR) " 1b,3b\n" + STR(PTR_WD) " 1b,3b\n" ".previous\n" : "=&r"(res), "+&r"(err) : "r"(vaddr), "i"(SIGSEGV) @@ -2189,7 +2189,7 @@ fpu_emul: "j 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" - STR(PTR) " 1b,3b\n" + STR(PTR_WD) " 1b,3b\n" ".previous\n" : "+&r"(res), "+&r"(err) : "r"(vaddr), "i"(SIGSEGV)); diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S index cbf6db98cfb3..2748c55820c2 100644 --- a/arch/mips/kernel/r2300_fpu.S +++ b/arch/mips/kernel/r2300_fpu.S @@ -23,14 +23,14 @@ #define EX(a,b) \ 9: a,##b; \ .section __ex_table,"a"; \ - PTR 9b,fault; \ + PTR_WD 9b,fault; \ .previous #define EX2(a,b) \ 9: a,##b; \ .section __ex_table,"a"; \ - PTR 9b,fault; \ - PTR 9b+4,fault; \ + PTR_WD 9b,fault; \ + PTR_WD 9b+4,fault; \ .previous .set mips1 diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S index b91e91106475..2e687c60bc4f 100644 --- a/arch/mips/kernel/r4k_fpu.S +++ b/arch/mips/kernel/r4k_fpu.S @@ -31,7 +31,7 @@ .ex\@: \insn \reg, \src .set pop .section __ex_table,"a" - PTR .ex\@, fault + PTR_WD .ex\@, fault .previous .endm diff --git a/arch/mips/kernel/relocate_kernel.S b/arch/mips/kernel/relocate_kernel.S index f3c908abdbb8..cfde14b48fd8 100644 --- a/arch/mips/kernel/relocate_kernel.S +++ b/arch/mips/kernel/relocate_kernel.S @@ -147,10 +147,10 @@ LEAF(kexec_smp_wait) kexec_args: EXPORT(kexec_args) -arg0: PTR 0x0 -arg1: PTR 0x0 -arg2: PTR 0x0 -arg3: PTR 0x0 +arg0: PTR_WD 0x0 +arg1: PTR_WD 0x0 +arg2: PTR_WD 0x0 +arg3: PTR_WD 0x0 .size kexec_args,PTRSIZE*4 #ifdef CONFIG_SMP @@ -161,10 +161,10 @@ arg3: PTR 0x0 */ secondary_kexec_args: EXPORT(secondary_kexec_args) -s_arg0: PTR 0x0 -s_arg1: PTR 0x0 -s_arg2: PTR 0x0 -s_arg3: PTR 0x0 +s_arg0: PTR_WD 0x0 +s_arg1: PTR_WD 0x0 +s_arg2: PTR_WD 0x0 +s_arg3: PTR_WD 0x0 .size secondary_kexec_args,PTRSIZE*4 kexec_flag: LONG 0x1 @@ -173,17 +173,17 @@ kexec_flag: kexec_start_address: EXPORT(kexec_start_address) - PTR 0x0 + PTR_WD 0x0 .size kexec_start_address, PTRSIZE kexec_indirection_page: EXPORT(kexec_indirection_page) - PTR 0 + PTR_WD 0 .size kexec_indirection_page, PTRSIZE relocate_new_kernel_end: relocate_new_kernel_size: EXPORT(relocate_new_kernel_size) - PTR relocate_new_kernel_end - relocate_new_kernel + PTR_WD relocate_new_kernel_end - relocate_new_kernel .size relocate_new_kernel_size, PTRSIZE diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index b1b2e106f711..9bfce5f75f60 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -72,10 +72,10 @@ loads_done: .set pop .section __ex_table,"a" - PTR load_a4, bad_stack_a4 - PTR load_a5, bad_stack_a5 - PTR load_a6, bad_stack_a6 - PTR load_a7, bad_stack_a7 + PTR_WD load_a4, bad_stack_a4 + PTR_WD load_a5, bad_stack_a5 + PTR_WD load_a6, bad_stack_a6 + PTR_WD load_a7, bad_stack_a7 .previous lw t0, TI_FLAGS($28) # syscall tracing enabled? @@ -216,7 +216,7 @@ einval: li v0, -ENOSYS #endif /* CONFIG_MIPS_MT_FPAFF */ #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) -#define __SYSCALL(nr, entry) PTR entry +#define __SYSCALL(nr, entry) PTR_WD entry .align 2 .type sys_call_table, @object EXPORT(sys_call_table) diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index f650c55a17dc..97456b2ca7dc 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -101,7 +101,7 @@ not_n32_scall: END(handle_sysn32) -#define __SYSCALL(nr, entry) PTR entry +#define __SYSCALL(nr, entry) PTR_WD entry .type sysn32_call_table, @object EXPORT(sysn32_call_table) #include diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S index 5d7bfc65e4d0..5f6ed4b4c399 100644 --- a/arch/mips/kernel/scall64-n64.S +++ b/arch/mips/kernel/scall64-n64.S @@ -109,7 +109,7 @@ illegal_syscall: j n64_syscall_exit END(handle_sys64) -#define __SYSCALL(nr, entry) PTR entry +#define __SYSCALL(nr, entry) PTR_WD entry .align 3 .type sys_call_table, @object EXPORT(sys_call_table) diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index cedc8bd88804..d3c2616cba22 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -73,10 +73,10 @@ load_a7: lw a7, 28(t0) # argument #8 from usp loads_done: .section __ex_table,"a" - PTR load_a4, bad_stack_a4 - PTR load_a5, bad_stack_a5 - PTR load_a6, bad_stack_a6 - PTR load_a7, bad_stack_a7 + PTR_WD load_a4, bad_stack_a4 + PTR_WD load_a5, bad_stack_a5 + PTR_WD load_a6, bad_stack_a6 + PTR_WD load_a7, bad_stack_a7 .previous li t1, _TIF_WORK_SYSCALL_ENTRY @@ -214,7 +214,7 @@ einval: li v0, -ENOSYS END(sys32_syscall) #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) -#define __SYSCALL(nr, entry) PTR entry +#define __SYSCALL(nr, entry) PTR_WD entry .align 3 .type sys32_call_table,@object EXPORT(sys32_call_table) diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 5512cd586e6e..ae93a607ddf7 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -122,8 +122,8 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new) " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" - " "STR(PTR)" 1b, 4b \n" - " "STR(PTR)" 2b, 4b \n" + " "STR(PTR_WD)" 1b, 4b \n" + " "STR(PTR_WD)" 2b, 4b \n" " .previous \n" " .set pop \n" : [old] "=&r" (old), @@ -152,8 +152,8 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new) " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" - " "STR(PTR)" 1b, 5b \n" - " "STR(PTR)" 2b, 5b \n" + " "STR(PTR_WD)" 1b, 5b \n" + " "STR(PTR_WD)" 2b, 5b \n" " .previous \n" " .set pop \n" : [old] "=&r" (old), diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S index a46db0807195..7767137c3e49 100644 --- a/arch/mips/lib/csum_partial.S +++ b/arch/mips/lib/csum_partial.S @@ -347,7 +347,7 @@ EXPORT_SYMBOL(csum_partial) .if \mode == LEGACY_MODE; \ 9: insn reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, .L_exc; \ + PTR_WD 9b, .L_exc; \ .previous; \ /* This is enabled in EVA mode */ \ .else; \ @@ -356,7 +356,7 @@ EXPORT_SYMBOL(csum_partial) ((\to == USEROP) && (type == ST_INSN)); \ 9: __BUILD_EVA_INSN(insn##e, reg, addr); \ .section __ex_table,"a"; \ - PTR 9b, .L_exc; \ + PTR_WD 9b, .L_exc; \ .previous; \ .else; \ /* EVA without exception */ \ diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S index 277c32296636..18a43f2e29c8 100644 --- a/arch/mips/lib/memcpy.S +++ b/arch/mips/lib/memcpy.S @@ -116,7 +116,7 @@ .if \mode == LEGACY_MODE; \ 9: insn reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous; \ /* This is assembled in EVA mode */ \ .else; \ @@ -125,7 +125,7 @@ ((\to == USEROP) && (type == ST_INSN)); \ 9: __BUILD_EVA_INSN(insn##e, reg, addr); \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous; \ .else; \ /* \ diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S index b0baa3c79fad..0b342bae9a98 100644 --- a/arch/mips/lib/memset.S +++ b/arch/mips/lib/memset.S @@ -52,7 +52,7 @@ 9: ___BUILD_EVA_INSN(insn, reg, addr); \ .endif; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous .macro f_fill64 dst, offset, val, fixup, mode diff --git a/arch/mips/lib/strncpy_user.S b/arch/mips/lib/strncpy_user.S index 556acf684d7b..13aaa9927ad1 100644 --- a/arch/mips/lib/strncpy_user.S +++ b/arch/mips/lib/strncpy_user.S @@ -15,7 +15,7 @@ #define EX(insn,reg,addr,handler) \ 9: insn reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous /* @@ -59,7 +59,7 @@ LEAF(__strncpy_from_user_asm) jr ra .section __ex_table,"a" - PTR 1b, .Lfault + PTR_WD 1b, .Lfault .previous EXPORT_SYMBOL(__strncpy_from_user_asm) diff --git a/arch/mips/lib/strnlen_user.S b/arch/mips/lib/strnlen_user.S index 92b63f20ec05..6de31b616f9c 100644 --- a/arch/mips/lib/strnlen_user.S +++ b/arch/mips/lib/strnlen_user.S @@ -14,7 +14,7 @@ #define EX(insn,reg,addr,handler) \ 9: insn reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous /* From 36268983e90316b37000a005642af42234dabb36 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 26 Jan 2022 16:38:52 +0100 Subject: [PATCH 0701/1295] Revert "ipv6: Honor all IPv6 PIO Valid Lifetime values" This reverts commit b75326c201242de9495ff98e5d5cff41d7fc0d9d. This commit breaks Linux compatibility with USGv6 tests. The RFC this commit was based on is actually an expired draft: no published RFC currently allows the new behaviour it introduced. Without full IETF endorsement, the flash renumbering scenario this patch was supposed to enable is never going to work, as other IPv6 equipements on the same LAN will keep the 2 hours limit. Fixes: b75326c20124 ("ipv6: Honor all IPv6 PIO Valid Lifetime values") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 ++ net/ipv6/addrconf.c | 27 ++++++++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78ea3e332688..e7ce719838b5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,6 +6,8 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ +#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ + #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3eee17790a82..f927c199a93c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2589,7 +2589,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0; + int create = 0, update_lft = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2633,19 +2633,32 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* Update lifetime (RFC4862 5.5.3 e) - * We deviate from RFC4862 by honoring all Valid Lifetimes to - * improve the reaction of SLAAC to renumbering events - * (draft-gont-6man-slaac-renum-06, Section 4.2) - */ + /* update lifetime (RFC2462 5.5.3 e) */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; + } + + if (update_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; From 3da4b7403db87d39bc2613cfd790de1de99a70ab Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Jan 2022 10:21:42 -0800 Subject: [PATCH 0702/1295] ALSA: usb-audio: initialize variables that could ignore errors clang static analysis reports this representative issue mixer.c:1548:35: warning: Assigned value is garbage or undefined ucontrol->value.integer.value[0] = val; ^ ~~~ The filter_error() macro allows errors to be ignored. If errors can be ignored, initialize variables so garbage will not be used. Fixes: 48cc42973509 ("ALSA: usb-audio: Filter error from connector kctl ops, too") Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20220126182142.1184819-1-trix@redhat.com Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index e8f3f8d622ec..630766ba259f 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1527,6 +1527,10 @@ error: usb_audio_err(chip, "cannot get connectors status: req = %#x, wValue = %#x, wIndex = %#x, type = %d\n", UAC_GET_CUR, validx, idx, cval->val_type); + + if (val) + *val = 0; + return filter_error(cval, ret); } From 94c82de43e01ef5747a95e4a590880de863fe423 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Wed, 26 Jan 2022 17:47:22 +0800 Subject: [PATCH 0703/1295] net: stmmac: configure PTP clock source prior to PTP initialization For Intel platform, it is required to configure PTP clock source prior PTP initialization in MAC. So, need to move ptp_clk_freq_config execution from stmmac_ptp_register() to stmmac_init_ptp(). Fixes: 76da35dc99af ("stmmac: intel: Add PSE and PCH PTP clock source selection") Cc: # 5.15.x Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 54071ae73879..25c802344356 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -890,6 +890,9 @@ static int stmmac_init_ptp(struct stmmac_priv *priv) bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; int ret; + if (priv->plat->ptp_clk_freq_config) + priv->plat->ptp_clk_freq_config(priv); + ret = stmmac_init_tstamp_counter(priv, STMMAC_HWTS_ACTIVE); if (ret) return ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 0d24ebd37873..1c9f02f9c317 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -297,9 +297,6 @@ void stmmac_ptp_register(struct stmmac_priv *priv) { int i; - if (priv->plat->ptp_clk_freq_config) - priv->plat->ptp_clk_freq_config(priv); - for (i = 0; i < priv->dma_cap.pps_out_num; i++) { if (i >= STMMAC_PPS_MAX) break; From 0735e639f129dff455aeb91da291f5c578cc33db Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Wed, 26 Jan 2022 17:47:23 +0800 Subject: [PATCH 0704/1295] net: stmmac: skip only stmmac_ptp_register when resume from suspend When resume from suspend, besides skipping PTP registration, it also skipping PTP HW initialization. This could cause PTP clock not able to operate properly when resume from suspend. To fix this, only stmmac_ptp_register() is skipped when resume from suspend. Fixes: fe1319291150 ("stmmac: Don't init ptp again when resume from suspend/hibernation") Cc: # 5.15.x Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 25c802344356..639a753266e6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -915,8 +915,6 @@ static int stmmac_init_ptp(struct stmmac_priv *priv) priv->hwts_tx_en = 0; priv->hwts_rx_en = 0; - stmmac_ptp_register(priv); - return 0; } @@ -3242,7 +3240,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv) /** * stmmac_hw_setup - setup mac in a usable state. * @dev : pointer to the device structure. - * @init_ptp: initialize PTP if set + * @ptp_register: register PTP if set * Description: * this is the main function to setup the HW in a usable state because the * dma engine is reset, the core registers are configured (e.g. AXI, @@ -3252,7 +3250,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv) * 0 on success and an appropriate (-)ve integer as defined in errno.h * file on failure. */ -static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) +static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) { struct stmmac_priv *priv = netdev_priv(dev); u32 rx_cnt = priv->plat->rx_queues_to_use; @@ -3309,13 +3307,13 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) stmmac_mmc_setup(priv); - if (init_ptp) { - ret = stmmac_init_ptp(priv); - if (ret == -EOPNOTSUPP) - netdev_warn(priv->dev, "PTP not supported by HW\n"); - else if (ret) - netdev_warn(priv->dev, "PTP init failed\n"); - } + ret = stmmac_init_ptp(priv); + if (ret == -EOPNOTSUPP) + netdev_warn(priv->dev, "PTP not supported by HW\n"); + else if (ret) + netdev_warn(priv->dev, "PTP init failed\n"); + else if (ptp_register) + stmmac_ptp_register(priv); priv->eee_tw_timer = STMMAC_DEFAULT_TWT_LS; From dcb2c5c6ca9b9177f04abaf76e5a983d177c9414 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 26 Jan 2022 15:10:25 +0200 Subject: [PATCH 0705/1295] net: bridge: vlan: fix single net device option dumping When dumping vlan options for a single net device we send the same entries infinitely because user-space expects a 0 return at the end but we keep returning skb->len and restarting the dump on retry. Fix it by returning the value from br_vlan_dump_dev() if it completed or there was an error. The only case that must return skb->len is when the dump was incomplete and needs to continue (-EMSGSIZE). Reported-by: Benjamin Poirier Fixes: 8dcea187088b ("net: bridge: vlan: add rtm definitions and dump support") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 84ba456a78cc..43201260e37b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2020,7 +2020,8 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); - if (err && err != -EMSGSIZE) + /* if the dump completed without an error we return 0 here */ + if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { From 9e0db41e7a0b6f1271cbcfb16dbf5b8641b4e440 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 00:52:15 +0800 Subject: [PATCH 0706/1295] net: stmmac: dwmac-sun8i: use return val of readl_poll_timeout() When readl_poll_timeout() timeout, we'd better directly use its return value. Before this patch: [ 2.145528] dwmac-sun8i: probe of 4500000.ethernet failed with error -14 After this patch: [ 2.138520] dwmac-sun8i: probe of 4500000.ethernet failed with error -110 Signed-off-by: Jisheng Zhang Acked-by: Jernej Skrabec Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 617d0e4c6495..09644ab0d87a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -756,7 +756,7 @@ static int sun8i_dwmac_reset(struct stmmac_priv *priv) if (err) { dev_err(priv->device, "EMAC reset timeout\n"); - return -EFAULT; + return err; } return 0; } From 492fefbaafb9d9f49d8d14614d57c956a81f2ea1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 12:24:24 -0800 Subject: [PATCH 0707/1295] MAINTAINERS: add more files to eth PHY include/linux/linkmode.h and include/linux/mii.h do not match anything in MAINTAINERS. Looks like they should be under Ethernet PHY. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0d7883977e9b..3e22999669c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7216,8 +7216,10 @@ F: drivers/net/mdio/of_mdio.c F: drivers/net/pcs/ F: drivers/net/phy/ F: include/dt-bindings/net/qca-ar803x.h +F: include/linux/linkmode.h F: include/linux/*mdio*.h F: include/linux/mdio/*.h +F: include/linux/mii.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h From 966f435add4821f53bf1c507dadc3ba9aaeb5f01 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 14:55:35 -0800 Subject: [PATCH 0708/1295] MAINTAINERS: add missing IPv4/IPv6 header paths Add missing headers to the IP entry. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3e22999669c4..b904348fed0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13461,7 +13461,11 @@ L: netdev@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git F: arch/x86/net/* +F: include/linux/ip.h +F: include/linux/ipv6* +F: include/net/fib* F: include/net/ip* +F: include/net/route.h F: net/ipv4/ F: net/ipv6/ From 153a0d187e767c68733b8e9f46218eb1f41ab902 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 16:51:16 -0800 Subject: [PATCH 0709/1295] ipv4: raw: lock the socket in raw_bind() For some reason, raw_bind() forgot to lock the socket. BUG: KCSAN: data-race in __ip4_datagram_connect / raw_bind write to 0xffff8881170d4308 of 4 bytes by task 5466 on cpu 0: raw_bind+0x1b0/0x250 net/ipv4/raw.c:739 inet_bind+0x56/0xa0 net/ipv4/af_inet.c:443 __sys_bind+0x14b/0x1b0 net/socket.c:1697 __do_sys_bind net/socket.c:1708 [inline] __se_sys_bind net/socket.c:1706 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff8881170d4308 of 4 bytes by task 5468 on cpu 1: __ip4_datagram_connect+0xb7/0x7b0 net/ipv4/datagram.c:39 ip4_datagram_connect+0x2a/0x40 net/ipv4/datagram.c:89 inet_dgram_connect+0x107/0x190 net/ipv4/af_inet.c:576 __sys_connect_file net/socket.c:1900 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1917 __do_sys_connect net/socket.c:1927 [inline] __se_sys_connect net/socket.c:1924 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1924 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x0003007f Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 5468 Comm: syz-executor.5 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/raw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a53f256bf9d3..0505935b6b8c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -722,6 +722,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret = -EINVAL; int chk_addr_ret; + lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; @@ -741,7 +742,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; -out: return ret; +out: + release_sock(sk); + return ret; } /* From d19a7af73b5ecaac8168712d18be72b9db166768 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Jan 2022 17:00:51 -0500 Subject: [PATCH 0710/1295] lockd: fix failure to cleanup client locks In my testing, we're sometimes hitting the request->fl_flags & FL_EXISTS case in posix_lock_inode, presumably just by random luck since we're not actually initializing fl_flags here. This probably didn't matter before commit 7f024fcd5c97 ("Keep read and write fds with each nlm_file") since we wouldn't previously unlock unless we knew there were locks. But now it causes lockd to give up on removing more locks. We could just initialize fl_flags, but really it seems dubious to be calling vfs_lock_file with random values in some of the fields. Fixes: 7f024fcd5c97 ("Keep read and write fds with each nlm_file") Signed-off-by: J. Bruce Fields [ cel: fixed checkpatch.pl nit ] Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 54c2e42130ca..0a22a2faf552 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -180,6 +180,7 @@ static int nlm_unlock_files(struct nlm_file *file) { struct file_lock lock; + locks_init_lock(&lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; From a0f90c8815706981c483a652a6aefca51a5e191c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 27 Jan 2022 18:34:19 +1000 Subject: [PATCH 0711/1295] drm/vmwgfx: Fix stale file descriptors on failed usercopy A failing usercopy of the fence_rep object will lead to a stale entry in the file descriptor table as put_unused_fd() won't release it. This enables userland to refer to a dangling 'file' object through that still valid file descriptor, leading to all kinds of use-after-free exploitation scenarios. Fix this by deferring the call to fd_install() until after the usercopy has succeeded. Fixes: c906965dee22 ("drm/vmwgfx: Add export fence to file descriptor support") Signed-off-by: Mathias Krause Signed-off-by: Zack Rusin Signed-off-by: Dave Airlie Signed-off-by: Linus Torvalds --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 5 ++-- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 33 +++++++++++++------------ drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index d6b66636a19b..ea3ecdda561d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1140,15 +1140,14 @@ extern int vmw_execbuf_fence_commands(struct drm_file *file_priv, struct vmw_private *dev_priv, struct vmw_fence_obj **p_fence, uint32_t *p_handle); -extern void vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, +extern int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, - struct sync_file *sync_file); + int32_t out_fence_fd); bool vmw_cmd_describe(const void *buf, u32 *size, char const **cmd); /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 44ca23b0ea4e..dd2ff441068e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3879,17 +3879,17 @@ int vmw_execbuf_fence_commands(struct drm_file *file_priv, * Also if copying fails, user-space will be unable to signal the fence object * so we wait for it immediately, and then unreference the user-space reference. */ -void +int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, struct sync_file *sync_file) + int32_t out_fence_fd) { struct drm_vmw_fence_rep fence_rep; if (user_fence_rep == NULL) - return; + return 0; memset(&fence_rep, 0, sizeof(fence_rep)); @@ -3917,19 +3917,13 @@ vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, * handle. */ if (unlikely(ret != 0) && (fence_rep.error == 0)) { - if (sync_file) - fput(sync_file->file); - - if (fence_rep.fd != -1) { - put_unused_fd(fence_rep.fd); - fence_rep.fd = -1; - } - ttm_ref_object_base_unref(vmw_fp->tfile, fence_handle); VMW_DEBUG_USER("Fence copy error. Syncing.\n"); (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); } + + return ret ? -EFAULT : 0; } /** @@ -4266,16 +4260,23 @@ int vmw_execbuf_process(struct drm_file *file_priv, (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); + } + } + + ret = vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, + user_fence_rep, fence, handle, out_fence_fd); + + if (sync_file) { + if (ret) { + /* usercopy of fence failed, put the file object */ + fput(sync_file->file); + put_unused_fd(out_fence_fd); } else { /* Link the fence with the FD created earlier */ fd_install(out_fence_fd, sync_file->file); } } - vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, - user_fence_rep, fence, handle, out_fence_fd, - sync_file); - /* Don't unreference when handing fence out */ if (unlikely(out_fence != NULL)) { *out_fence = fence; @@ -4293,7 +4294,7 @@ int vmw_execbuf_process(struct drm_file *file_priv, */ vmw_validation_unref_lists(&val_ctx); - return 0; + return ret; out_unlock_binding: mutex_unlock(&dev_priv->binding_mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index c60d395f9e2e..5001b87aebe8 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -1128,7 +1128,7 @@ int vmw_fence_event_ioctl(struct drm_device *dev, void *data, } vmw_execbuf_copy_fence_user(dev_priv, vmw_fp, 0, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); vmw_fence_obj_unreference(&fence); return 0; out_no_create: diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 4e693e8de2c3..bbd2f4ec08ec 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -2501,7 +2501,7 @@ void vmw_kms_helper_validation_finish(struct vmw_private *dev_priv, if (file_priv) vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); if (out_fence) *out_fence = fence; else From 970a5a3ea86da637471d3cd04d513a0755aba4bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:10:21 -0800 Subject: [PATCH 0712/1295] ipv4: tcp: send zero IPID in SYNACK messages In commit 431280eebed9 ("ipv4: tcp: send zero IPID for RST and ACK sent in SYN-RECV and TIME-WAIT state") we took care of some ctl packets sent by TCP. It turns out we need to use a similar strategy for SYNACK packets. By default, they carry IP_DF and IPID==0, but there are ways to ask them to use the hashed IP ident generator and thus be used to build off-path attacks. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) One of this way is to force (before listener is started) echo 1 >/proc/sys/net/ipv4/ip_no_pmtu_disc Another way is using forged ICMP ICMP_FRAG_NEEDED with a very small MTU (like 68) to force a false return from ip_dont_fragment() In this patch, ip_build_and_send_pkt() uses the following heuristics. 1) Most SYNACK packets are smaller than IPV4_MIN_MTU and therefore can use IP_DF regardless of the listener or route pmtu setting. 2) In case the SYNACK packet is bigger than IPV4_MIN_MTU, we use prandom_u32() generator instead of the IPv4 hashed ident one. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: Ray Che Reviewed-by: David Ahern Cc: Geoff Alexander Cc: Willy Tarreau Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e331c8d4e6cf..139cec29ed06 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -162,12 +162,19 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - if (ip_dont_fragment(sk, &rt->dst)) { + /* Do not bother generating IPID for small packets (eg SYNACK) */ + if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(net, iph, 1); + /* TCP packets here are SYNACK with fat IPv4/TCP options. + * Avoid using the hashed IP ident generator. + */ + if (sk->sk_protocol == IPPROTO_TCP) + iph->id = (__force __be16)prandom_u32(); + else + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { From 23f57406b82de51809d5812afd96f210f8b627f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:10:22 -0800 Subject: [PATCH 0713/1295] ipv4: avoid using shared IP generator for connected sockets ip_select_ident_segs() has been very conservative about using the connected socket private generator only for packets with IP_DF set, claiming it was needed for some VJ compression implementations. As mentioned in this referenced document, this can be abused. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) Before switching to pure random IPID generation and possibly hurt some workloads, lets use the private inet socket generator. Not only this will remove one vulnerability, this will also improve performance of TCP flows using pmtudisc==IP_PMTUDISC_DONT Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reported-by: Ray Che Cc: Willy Tarreau Signed-off-by: Jakub Kicinski --- include/net/ip.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 81e23a102a0d..b51bae43b0dd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -525,19 +525,18 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); + /* We had many attacks based on IPID, use the private + * generator as much as we can. + */ + if (sk && inet_sk(sk)->inet_daddr) { + iph->id = htons(inet_sk(sk)->inet_id); + inet_sk(sk)->inet_id += segs; + return; + } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { - /* This is only to work around buggy Windows95/2000 - * VJ compression implementations. If the ID field - * does not change, they drop every other packet in - * a TCP stream using header compression. - */ - if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; - } else { - iph->id = 0; - } + iph->id = 0; } else { + /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } From 3c42b2019863b327caa233072c50739d4144dd16 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:34:04 -0800 Subject: [PATCH 0714/1295] ipv4: remove sparse error in ip_neigh_gw4() ./include/net/route.h:373:48: warning: incorrect type in argument 2 (different base types) ./include/net/route.h:373:48: expected unsigned int [usertype] key ./include/net/route.h:373:48: got restricted __be32 [usertype] daddr Fixes: 5c9f7c1dfc2e ("ipv4: Add helpers for neigh lookup for nexthop") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220127013404.1279313-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/route.h b/include/net/route.h index 4c858dcf1aa8..25404fc2b483 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -370,7 +370,7 @@ static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, { struct neighbour *neigh; - neigh = __ipv4_neigh_lookup_noref(dev, daddr); + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); From 364df53c081d93fcfd6b91085ff2650c7f17b3c7 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 27 Jan 2022 17:13:01 +0800 Subject: [PATCH 0715/1295] net: socket: rename SKB_DROP_REASON_SOCKET_FILTER Rename SKB_DROP_REASON_SOCKET_FILTER, which is used as the reason of skb drop out of socket filter before it's part of a released kernel. It will be used for more protocols than just TCP in future series. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20220127091308.91401-2-imagedong@tencent.com/ Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- include/trace/events/skb.h | 2 +- net/ipv4/tcp_ipv4.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf11e1fbd69b..8a636e678902 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -318,7 +318,7 @@ enum skb_drop_reason { SKB_DROP_REASON_NO_SOCKET, SKB_DROP_REASON_PKT_TOO_SMALL, SKB_DROP_REASON_TCP_CSUM, - SKB_DROP_REASON_TCP_FILTER, + SKB_DROP_REASON_SOCKET_FILTER, SKB_DROP_REASON_UDP_CSUM, SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3e042ca2cedb..a8a64b97504d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -14,7 +14,7 @@ EM(SKB_DROP_REASON_NO_SOCKET, NO_SOCKET) \ EM(SKB_DROP_REASON_PKT_TOO_SMALL, PKT_TOO_SMALL) \ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ - EM(SKB_DROP_REASON_TCP_FILTER, TCP_FILTER) \ + EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ EMe(SKB_DROP_REASON_MAX, MAX) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..938b59636578 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2095,7 +2095,7 @@ process: nf_reset_ct(skb); if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_TCP_FILTER; + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; From b07f413732549e5a96e891411fbb5980f2d8e5a1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Jan 2022 13:40:38 +0100 Subject: [PATCH 0716/1295] netfilter: nf_tables: remove assignment with no effect in chain blob builder cppcheck possible warnings: >> net/netfilter/nf_tables_api.c:2014:2: warning: Assignment of function parameter has no effect outside the function. Did you forget dereferencing it? [uselessAssignmentPtrArg] ptr += offsetof(struct nft_rule_dp, data); ^ Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cf454f8ca2b0..5fa16990da95 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2011,7 +2011,6 @@ static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr) prule = (struct nft_rule_dp *)ptr; prule->is_last = 1; - ptr += offsetof(struct nft_rule_dp, data); /* blob size does not include the trailer rule */ } From fd20d9738395cf8e27d0a17eba34169699fccdff Mon Sep 17 00:00:00 2001 From: Tim Yi Date: Thu, 27 Jan 2022 15:49:53 +0800 Subject: [PATCH 0717/1295] net: bridge: vlan: fix memory leak in __allowed_ingress When using per-vlan state, if vlan snooping and stats are disabled, untagged or priority-tagged ingress frame will go to check pvid state. If the port state is forwarding and the pvid state is not learning/forwarding, untagged or priority-tagged frame will be dropped but skb memory is not freed. Should free skb when __allowed_ingress returns false. Fixes: a580c76d534c ("net: bridge: vlan: add per-vlan state") Signed-off-by: Tim Yi Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20220127074953.12632-1-tim.yi@pica8.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 43201260e37b..1402d5ca242d 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -560,10 +560,10 @@ static bool __allowed_ingress(const struct net_bridge *br, !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); - return br_vlan_state_allowed(*state, true); - } else { - return true; + if (!br_vlan_state_allowed(*state, true)) + goto drop; } + return true; } } v = br_vlan_find(vg, *vid); From 10825410b956dc1ed8c5fbc8bbedaffdadde7f20 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Thu, 27 Jan 2022 18:00:47 +0800 Subject: [PATCH 0718/1295] blk-mq: Fix wrong wakeup batch configuration which will cause hang Commit 180dccb0dba4f ("blk-mq: fix tag_get wait task can't be awakened") will recalculate wake_batch when incrementing or decrementing active_queues to avoid wake_batch > hctx_max_depth. At the same time, in order to not affect performance as much as possible, the minimum wakeup batch is set to 4. But when the QD is small (such as QD=1), if inc or dec active_queues increases wakeup batch, that can lead to a hang: Fix this problem with the following strategies: QD : >= 32 | < 32 --------------------------------- wakeup batch: 8~4 | 3~1 Fixes: 180dccb0dba4f ("blk-mq: fix tag_get wait task can't be awakened") Link: https://lore.kernel.org/linux-block/78cafe94-a787-e006-8851-69906f0c2128@huawei.com/T/#t Reported-by: Alex Xu (Hello71) Signed-off-by: Laibin Qiu Tested-by: Alex Xu (Hello71) Link: https://lore.kernel.org/r/20220127100047.1763746-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- lib/sbitmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 6220fa67fb7e..09d293c30fd2 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -488,9 +488,13 @@ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; + unsigned int min_batch; + unsigned int depth = (sbq->sb.depth + users - 1) / users; - wake_batch = clamp_val((sbq->sb.depth + users - 1) / - users, 4, SBQ_WAKE_BATCH); + min_batch = sbq->sb.depth >= (4 * SBQ_WAIT_QUEUES) ? 4 : 1; + + wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, + min_batch, SBQ_WAKE_BATCH); __sbitmap_queue_update_wake_batch(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); From f6133fbd373811066c8441737e65f384c8f31974 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 27 Jan 2022 14:04:44 +0000 Subject: [PATCH 0719/1295] io_uring: remove unused argument from io_rsrc_node_alloc io_ring_ctx is not used in the function. Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220127140444.4016585-1-usama.arif@bytedance.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dd4c801c7afd..2e04f718319d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7846,7 +7846,7 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +static struct io_rsrc_node *io_rsrc_node_alloc(void) { struct io_rsrc_node *ref_node; @@ -7897,7 +7897,7 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); + ctx->rsrc_backup_node = io_rsrc_node_alloc(); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } From dede34b2c1a88e26f8353b433e381ea355f7258d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 26 Jan 2022 13:13:41 -0700 Subject: [PATCH 0720/1295] docs/kselftest: clarify running mainline tests on stables Update the document to clarifiy support for running mainline kselftest on stable releases and the reasons for not removing test code that can test older kernels. Signed-off-by: Shuah Khan --- Documentation/dev-tools/kselftest.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index dcefee707ccd..a833ecf12fbc 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -7,6 +7,14 @@ directory. These are intended to be small tests to exercise individual code paths in the kernel. Tests are intended to be run after building, installing and booting a kernel. +Kselftest from mainline can be run on older stable kernels. Running tests +from mainline offers the best coverage. Several test rings run mainline +kselftest suite on stable releases. The reason is that when a new test +gets added to test existing code to regression test a bug, we should be +able to run that test on an older kernel. Hence, it is important to keep +code that can still test an older kernel and make sure it skips the test +gracefully on newer releases. + You can find additional information on Kselftest framework, how to write new tests using the framework on Kselftest wiki: From fc4eb486a59d70bd35cf1209f0e68c2d8b979193 Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:35 +0800 Subject: [PATCH 0721/1295] selftests/zram: Skip max_comp_streams interface on newer kernel Since commit 43209ea2d17a ("zram: remove max_comp_streams internals"), zram has switched to per-cpu streams. Even kernel still keep this interface for some reasons, but writing to max_comp_stream doesn't take any effect. So skip it on newer kernel ie 4.7. The code that comparing kernel version is from xfstests testsuite ext4/053. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram_lib.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh index 6f872f266fd1..f47fc0f27e99 100755 --- a/tools/testing/selftests/zram/zram_lib.sh +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -11,6 +11,9 @@ dev_mounted=-1 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 +kernel_version=`uname -r | cut -d'.' -f1,2` +kernel_major=${kernel_version%.*} +kernel_minor=${kernel_version#*.} trap INT @@ -25,6 +28,20 @@ check_prereqs() fi } +kernel_gte() +{ + major=${1%.*} + minor=${1#*.} + + if [ $kernel_major -gt $major ]; then + return 0 + elif [[ $kernel_major -eq $major && $kernel_minor -ge $minor ]]; then + return 0 + fi + + return 1 +} + zram_cleanup() { echo "zram cleanup" @@ -86,6 +103,13 @@ zram_max_streams() { echo "set max_comp_streams to zram device(s)" + kernel_gte 4.7 + if [ $? -eq 0 ]; then + echo "The device attribute max_comp_streams was"\ + "deprecated in 4.7" + return 0 + fi + local i=0 for max_s in $zram_max_streams; do local sys_path="/sys/block/zram${i}/max_comp_streams" From d18da7ec3719559d6e74937266d0416e6c7e0b31 Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:36 +0800 Subject: [PATCH 0722/1295] selftests/zram01.sh: Fix compression ratio calculation zram01 uses `free -m` to measure zram memory usage. The results are no sense because they are polluted by all running processes on the system. We Should only calculate the free memory delta for the current process. So use the third field of /sys/block/zram/mm_stat to measure memory usage instead. The file is available since kernel 4.1. orig_data_size(first): uncompressed size of data stored in this disk. compr_data_size(second): compressed size of data stored in this disk mem_used_total(third): the amount of memory allocated for this disk Also remove useless zram cleanup call in zram_fill_fs and so we don't need to cleanup zram twice if fails. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram01.sh | 34 ++++++++------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh index 114863d9fb87..e9e9eb777e2c 100755 --- a/tools/testing/selftests/zram/zram01.sh +++ b/tools/testing/selftests/zram/zram01.sh @@ -33,8 +33,6 @@ zram_algs="lzo" zram_fill_fs() { - local mem_free0=$(free -m | awk 'NR==2 {print $4}') - for i in $(seq 0 $(($dev_num - 1))); do echo "fill zram$i..." local b=0 @@ -45,29 +43,17 @@ zram_fill_fs() b=$(($b + 1)) done echo "zram$i can be filled with '$b' KB" + + local mem_used_total=`awk '{print $3}' "/sys/block/zram$i/mm_stat"` + local v=$((100 * 1024 * $b / $mem_used_total)) + if [ "$v" -lt 100 ]; then + echo "FAIL compression ratio: 0.$v:1" + ERR_CODE=-1 + return + fi + + echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK" done - - local mem_free1=$(free -m | awk 'NR==2 {print $4}') - local used_mem=$(($mem_free0 - $mem_free1)) - - local total_size=0 - for sm in $zram_sizes; do - local s=$(echo $sm | sed 's/M//') - total_size=$(($total_size + $s)) - done - - echo "zram used ${used_mem}M, zram disk sizes ${total_size}M" - - local v=$((100 * $total_size / $used_mem)) - - if [ "$v" -lt 100 ]; then - echo "FAIL compression ratio: 0.$v:1" - ERR_CODE=-1 - zram_cleanup - return - fi - - echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK" } check_prereqs From 01dabed20573804750af5c7bf8d1598a6bf7bf6e Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Thu, 27 Jan 2022 17:11:37 +0800 Subject: [PATCH 0723/1295] selftests/zram: Adapt the situation that /dev/zram0 is being used If zram-generator package is installed and works, then we can not remove zram module because zram swap is being used. This case needs a clean zram environment, change this test by using hot_add/hot_remove interface. So even zram device is being used, we still can add zram device and remove them in cleanup. The two interface was introduced since kernel commit 6566d1a32bf7("zram: add dynamic device add/remove functionality") in v4.2-rc1. If kernel supports these two interface, we use hot_add/hot_remove to slove this problem, if not, just check whether zram is being used or built in, then skip it on old kernel. Signed-off-by: Yang Xu Signed-off-by: Shuah Khan --- tools/testing/selftests/zram/zram.sh | 15 +-- tools/testing/selftests/zram/zram01.sh | 3 +- tools/testing/selftests/zram/zram02.sh | 1 - tools/testing/selftests/zram/zram_lib.sh | 112 +++++++++++++---------- 4 files changed, 67 insertions(+), 64 deletions(-) diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh index 232e958ec454..b0b91d9b0dc2 100755 --- a/tools/testing/selftests/zram/zram.sh +++ b/tools/testing/selftests/zram/zram.sh @@ -2,9 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 TCID="zram.sh" -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - . ./zram_lib.sh run_zram () { @@ -18,14 +15,4 @@ echo "" check_prereqs -# check zram module exists -MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko -if [ -f $MODULE_PATH ]; then - run_zram -elif [ -b /dev/zram0 ]; then - run_zram -else - echo "$TCID : No zram.ko module or /dev/zram0 device file not found" - echo "$TCID : CONFIG_ZRAM is not set" - exit $ksft_skip -fi +run_zram diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh index e9e9eb777e2c..8f4affe34f3e 100755 --- a/tools/testing/selftests/zram/zram01.sh +++ b/tools/testing/selftests/zram/zram01.sh @@ -33,7 +33,7 @@ zram_algs="lzo" zram_fill_fs() { - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo "fill zram$i..." local b=0 while [ true ]; do @@ -67,7 +67,6 @@ zram_mount zram_fill_fs zram_cleanup -zram_unload if [ $ERR_CODE -ne 0 ]; then echo "$TCID : [FAIL]" diff --git a/tools/testing/selftests/zram/zram02.sh b/tools/testing/selftests/zram/zram02.sh index e83b404807c0..2418b0c4ed13 100755 --- a/tools/testing/selftests/zram/zram02.sh +++ b/tools/testing/selftests/zram/zram02.sh @@ -36,7 +36,6 @@ zram_set_memlimit zram_makeswap zram_swapoff zram_cleanup -zram_unload if [ $ERR_CODE -ne 0 ]; then echo "$TCID : [FAIL]" diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh index f47fc0f27e99..21ec1966de76 100755 --- a/tools/testing/selftests/zram/zram_lib.sh +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -5,10 +5,12 @@ # Author: Alexey Kodanev # Modified: Naresh Kamboju -MODULE=0 dev_makeswap=-1 dev_mounted=-1 - +dev_start=0 +dev_end=-1 +module_load=-1 +sys_control=-1 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 kernel_version=`uname -r | cut -d'.' -f1,2` @@ -46,57 +48,72 @@ zram_cleanup() { echo "zram cleanup" local i= - for i in $(seq 0 $dev_makeswap); do + for i in $(seq $dev_start $dev_makeswap); do swapoff /dev/zram$i done - for i in $(seq 0 $dev_mounted); do + for i in $(seq $dev_start $dev_mounted); do umount /dev/zram$i done - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo 1 > /sys/block/zram${i}/reset rm -rf zram$i done -} + if [ $sys_control -eq 1 ]; then + for i in $(seq $dev_start $dev_end); do + echo $i > /sys/class/zram-control/hot_remove + done + fi -zram_unload() -{ - if [ $MODULE -ne 0 ] ; then - echo "zram rmmod zram" + if [ $module_load -eq 1 ]; then rmmod zram > /dev/null 2>&1 fi } zram_load() { - # check zram module exists - MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko - if [ -f $MODULE_PATH ]; then - MODULE=1 - echo "create '$dev_num' zram device(s)" - modprobe zram num_devices=$dev_num - if [ $? -ne 0 ]; then - echo "failed to insert zram module" - exit 1 - fi + echo "create '$dev_num' zram device(s)" - dev_num_created=$(ls /dev/zram* | wc -w) + # zram module loaded, new kernel + if [ -d "/sys/class/zram-control" ]; then + echo "zram modules already loaded, kernel supports" \ + "zram-control interface" + dev_start=$(ls /dev/zram* | wc -w) + dev_end=$(($dev_start + $dev_num - 1)) + sys_control=1 - if [ "$dev_num_created" -ne "$dev_num" ]; then - echo "unexpected num of devices: $dev_num_created" - ERR_CODE=-1 - else - echo "zram load module successful" - fi - elif [ -b /dev/zram0 ]; then - echo "/dev/zram0 device file found: OK" - else - echo "ERROR: No zram.ko module or no /dev/zram0 device found" - echo "$TCID : CONFIG_ZRAM is not set" - exit 1 + for i in $(seq $dev_start $dev_end); do + cat /sys/class/zram-control/hot_add > /dev/null + done + + echo "all zram devices (/dev/zram$dev_start~$dev_end" \ + "successfully created" + return 0 fi + + # detect old kernel or built-in + modprobe zram num_devices=$dev_num + if [ ! -d "/sys/class/zram-control" ]; then + if grep -q '^zram' /proc/modules; then + rmmod zram > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "zram module is being used on old kernel" \ + "without zram-control interface" + exit $ksft_skip + fi + else + echo "test needs CONFIG_ZRAM=m on old kernel without" \ + "zram-control interface" + exit $ksft_skip + fi + modprobe zram num_devices=$dev_num + fi + + module_load=1 + dev_end=$(($dev_num - 1)) + echo "all zram devices (/dev/zram0~$dev_end) successfully created" } zram_max_streams() @@ -110,7 +127,7 @@ zram_max_streams() return 0 fi - local i=0 + local i=$dev_start for max_s in $zram_max_streams; do local sys_path="/sys/block/zram${i}/max_comp_streams" echo $max_s > $sys_path || \ @@ -122,7 +139,7 @@ zram_max_streams() echo "FAIL can't set max_streams '$max_s', get $max_stream" i=$(($i + 1)) - echo "$sys_path = '$max_streams' ($i/$dev_num)" + echo "$sys_path = '$max_streams'" done echo "zram max streams: OK" @@ -132,15 +149,16 @@ zram_compress_alg() { echo "test that we can set compression algorithm" - local algs=$(cat /sys/block/zram0/comp_algorithm) + local i=$dev_start + local algs=$(cat /sys/block/zram${i}/comp_algorithm) echo "supported algs: $algs" - local i=0 + for alg in $zram_algs; do local sys_path="/sys/block/zram${i}/comp_algorithm" echo "$alg" > $sys_path || \ echo "FAIL can't set '$alg' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$alg' ($i/$dev_num)" + echo "$sys_path = '$alg'" done echo "zram set compression algorithm: OK" @@ -149,14 +167,14 @@ zram_compress_alg() zram_set_disksizes() { echo "set disk size to zram device(s)" - local i=0 + local i=$dev_start for ds in $zram_sizes; do local sys_path="/sys/block/zram${i}/disksize" echo "$ds" > $sys_path || \ echo "FAIL can't set '$ds' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$ds' ($i/$dev_num)" + echo "$sys_path = '$ds'" done echo "zram set disksizes: OK" @@ -166,14 +184,14 @@ zram_set_memlimit() { echo "set memory limit to zram device(s)" - local i=0 + local i=$dev_start for ds in $zram_mem_limits; do local sys_path="/sys/block/zram${i}/mem_limit" echo "$ds" > $sys_path || \ echo "FAIL can't set '$ds' to $sys_path" i=$(($i + 1)) - echo "$sys_path = '$ds' ($i/$dev_num)" + echo "$sys_path = '$ds'" done echo "zram set memory limit: OK" @@ -182,8 +200,8 @@ zram_set_memlimit() zram_makeswap() { echo "make swap with zram device(s)" - local i=0 - for i in $(seq 0 $(($dev_num - 1))); do + local i=$dev_start + for i in $(seq $dev_start $dev_end); do mkswap /dev/zram$i > err.log 2>&1 if [ $? -ne 0 ]; then cat err.log @@ -206,7 +224,7 @@ zram_makeswap() zram_swapoff() { local i= - for i in $(seq 0 $dev_makeswap); do + for i in $(seq $dev_start $dev_end); do swapoff /dev/zram$i > err.log 2>&1 if [ $? -ne 0 ]; then cat err.log @@ -220,7 +238,7 @@ zram_swapoff() zram_makefs() { - local i=0 + local i=$dev_start for fs in $zram_filesystems; do # if requested fs not supported default it to ext2 which mkfs.$fs > /dev/null 2>&1 || fs=ext2 @@ -239,7 +257,7 @@ zram_makefs() zram_mount() { local i=0 - for i in $(seq 0 $(($dev_num - 1))); do + for i in $(seq $dev_start $dev_end); do echo "mount /dev/zram$i" mkdir zram$i mount /dev/zram$i zram$i > /dev/null || \ From 908a26e139e8cf21093acc56d8e90ddad2ad1eff Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 27 Jan 2022 21:33:45 +0500 Subject: [PATCH 0724/1295] selftests/exec: Remove pipe from TEST_GEN_FILES pipe named FIFO special file is being created in execveat.c to perform some tests. Makefile doesn't need to do anything with the pipe. When it isn't found, Makefile generates the following build error: make: *** No rule to make target '../tools/testing/selftests/exec/pipe', needed by 'all'. Stop. pipe is created and removed during test run-time. Amended change log to add pipe remove info: Shuah Khan Fixes: 61016db15b8e ("selftests/exec: Verify execve of non-regular files fail") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Shuah Khan Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index dd61118df66e..12c5e27d32c1 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -5,7 +5,7 @@ CFLAGS += -D_GNU_SOURCE TEST_PROGS := binfmt_script non-regular TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 -TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe +TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile From 941518d6538afa5ea0edc26e6c009d0b3163d422 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 20 Jan 2022 17:00:33 -0700 Subject: [PATCH 0725/1295] docs: Hook the RTLA documents into the kernel docs build The RTLA documents were added to Documentation/ but never hooked into the rest of the docs build, leading to a bunch of warnings like: Documentation/tools/rtla/rtla-osnoise.rst: WARNING: document isn't included in any toctree Add some basic glue to wire these documents into the build so that they are available with the rest of the rendered docs. No attempt has been made to turn the RTLA docs into proper RST files rather than warmed-over man pages; that is an exercise for the future. Fixes: d40d48e1f1f2 ("rtla: Add Documentation") Acked-by: Daniel Bristot de Oliveira Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/877dau555q.fsf@meer.lwn.net Signed-off-by: Jonathan Corbet --- Documentation/index.rst | 1 + Documentation/tools/index.rst | 20 ++++++++++++++++++++ Documentation/tools/rtla/index.rst | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 Documentation/tools/index.rst create mode 100644 Documentation/tools/rtla/index.rst diff --git a/Documentation/index.rst b/Documentation/index.rst index 2b4de3926858..b58692d687f6 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -166,6 +166,7 @@ to ReStructured Text format, or are simply too old. .. toctree:: :maxdepth: 2 + tools/index staging/index watch_queue diff --git a/Documentation/tools/index.rst b/Documentation/tools/index.rst new file mode 100644 index 000000000000..0bb1e61bdcc0 --- /dev/null +++ b/Documentation/tools/index.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Kernel tools +============ + +This book covers user-space tools that are shipped with the kernel source; +more additions are needed here: + +.. toctree:: + :maxdepth: 1 + + rtla/index + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/tools/rtla/index.rst b/Documentation/tools/rtla/index.rst new file mode 100644 index 000000000000..840f0bf3e803 --- /dev/null +++ b/Documentation/tools/rtla/index.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +The realtime Linux analysis tool +================================ + +RTLA provides a set of tools for the analysis of the kernel's realtime +behavior on specific hardware. + +.. toctree:: + :maxdepth: 1 + + rtla + rtla-osnoise + rtla-osnoise-hist + rtla-osnoise-top + rtla-timerlat + rtla-timerlat-hist + rtla-timerlat-top + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` From 10855b45a428d8888b1a111d7f607c32a6a49a06 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Mon, 24 Jan 2022 17:14:47 +0900 Subject: [PATCH 0726/1295] docs: fix typo in Documentation/kernel-hacking/locking.rst Change copy_from_user*( to copy_from_user() . Signed-off-by: Takahiro Itazuri Link: https://lore.kernel.org/r/20220124081447.34066-1-itazur@amazon.com Signed-off-by: Jonathan Corbet --- Documentation/kernel-hacking/locking.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst index e6cd40663ea5..4cbd50edf277 100644 --- a/Documentation/kernel-hacking/locking.rst +++ b/Documentation/kernel-hacking/locking.rst @@ -295,7 +295,7 @@ Pete Zaitcev gives the following summary: - If you are in a process context (any syscall) and want to lock other process out, use a mutex. You can take a mutex and sleep - (``copy_from_user*(`` or ``kmalloc(x,GFP_KERNEL)``). + (``copy_from_user()`` or ``kmalloc(x,GFP_KERNEL)``). - Otherwise (== data can be touched in an interrupt), use spin_lock_irqsave() and From 573fe46e398f4b451d075e854d221f6197941540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 21 Jan 2022 12:58:04 +0100 Subject: [PATCH 0727/1295] Documentation: arm: marvell: Extend Avanta list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include another two SoCs from Avanta family. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20220121115804.28824-1-pali@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/arm/marvell.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/arm/marvell.rst b/Documentation/arm/marvell.rst index 9485a5a2e2e9..2f41caa0096c 100644 --- a/Documentation/arm/marvell.rst +++ b/Documentation/arm/marvell.rst @@ -266,10 +266,12 @@ Avanta family ------------- Flavors: + - 88F6500 - 88F6510 - 88F6530P - 88F6550 - 88F6560 + - 88F6601 Homepage: https://web.archive.org/web/20181005145041/http://www.marvell.com/broadband/ From 854d0982eef0e424e8108d09d9275aaf445b1597 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Mon, 17 Jan 2022 12:13:37 +0100 Subject: [PATCH 0728/1295] docs/vm: Fix typo in *harden* Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Paul Menzel Link: https://lore.kernel.org/r/20220117111338.115455-1-pmenzel@molgen.mpg.de Signed-off-by: Jonathan Corbet --- Documentation/vm/page_table_check.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst index 81f521ff7ea7..1a09472f10a3 100644 --- a/Documentation/vm/page_table_check.rst +++ b/Documentation/vm/page_table_check.rst @@ -9,7 +9,7 @@ Page Table Check Introduction ============ -Page table check allows to hardern the kernel by ensuring that some types of +Page table check allows to harden the kernel by ensuring that some types of the memory corruptions are prevented. Page table check performs extra verifications at the time when new pages become From 53960faf2b731dd2f9ed6e1334634b8ba6286850 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 25 Jan 2022 19:50:31 +0530 Subject: [PATCH 0729/1295] arm64: Add Cortex-A510 CPU part definition Add the CPU Partnumbers for the new Arm designs. Cc: Catalin Marinas Cc: Will Deacon Cc: Suzuki Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Suzuki K Poulose Acked-by: Catalin Marinas Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1643120437-14352-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Mathieu Poirier --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 19b8441aa8f2..e8fdc10395b6 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -73,6 +73,7 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_CORTEX_A510 0xD46 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 @@ -115,6 +116,7 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) From 607a9afaae09cde21ece458a8f10cb99d3f94f14 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 25 Jan 2022 19:50:32 +0530 Subject: [PATCH 0730/1295] arm64: errata: Add detection for TRBE ignored system register writes TRBE implementations affected by Arm erratum #2064142 might fail to write into certain system registers after the TRBE has been disabled. Under some conditions after TRBE has been disabled, writes into certain TRBE registers TRBLIMITR_EL1, TRBPTR_EL1, TRBBASER_EL1, TRBSR_EL1 and TRBTRG_EL1 will be ignored and not be effected. This adds a new errata ARM64_ERRATUM_2064142 in arm64 errata framework. Cc: Catalin Marinas Cc: Will Deacon Cc: Mathieu Poirier Cc: Suzuki Poulose Cc: coresight@lists.linaro.org Cc: linux-doc@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Suzuki K Poulose Acked-by: Catalin Marinas Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1643120437-14352-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Mathieu Poirier --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 18 ++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/tools/cpucaps | 1 + 4 files changed, 30 insertions(+) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 5342e895fb60..c9b30e6c2b6c 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -52,6 +52,8 @@ stable kernels. | Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6978140edfa4..0033436e3416 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -778,6 +778,24 @@ config ARM64_ERRATUM_2224489 If unsure, say Y. +config ARM64_ERRATUM_2064142 + bool "Cortex-A510: 2064142: workaround TRBE register writes while disabled" + depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in + default y + help + This option adds the workaround for ARM Cortex-A510 erratum 2064142. + + Affected Cortex-A510 core might fail to write into system registers after the + TRBE has been disabled. Under some conditions after the TRBE has been disabled + writes into TRBE registers TRBLIMITR_EL1, TRBPTR_EL1, TRBBASER_EL1, TRBSR_EL1, + and TRBTRG_EL1 will be ignored and will not be effected. + + Work around this in the driver by executing TSB CSYNC and DSB after collection + is stopped and before performing a system register write to one of the affected + registers. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 9e1c1aef9ebd..cbb7d5a9aee7 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -597,6 +597,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2064142 + { + .desc = "ARM erratum 2064142", + .capability = ARM64_WORKAROUND_2064142, + + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, #endif { } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 870c39537dd0..fca3cb329e1d 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -55,6 +55,7 @@ WORKAROUND_1418040 WORKAROUND_1463225 WORKAROUND_1508412 WORKAROUND_1542419 +WORKAROUND_2064142 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE From 3bd94a8759de9b724b83a80942b0354acd7701eb Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 25 Jan 2022 19:50:33 +0530 Subject: [PATCH 0731/1295] arm64: errata: Add detection for TRBE invalid prohibited states TRBE implementations affected by Arm erratum #2038923 might get TRBE into an inconsistent view on whether trace is prohibited within the CPU. As a result, the trace buffer or trace buffer state might be corrupted. This happens after TRBE buffer has been enabled by setting TRBLIMITR_EL1.E, followed by just a single context synchronization event before execution changes from a context, in which trace is prohibited to one where it isn't, or vice versa. In these mentioned conditions, the view of whether trace is prohibited is inconsistent between parts of the CPU, and the trace buffer or the trace buffer state might be corrupted. This adds a new errata ARM64_ERRATUM_2038923 in arm64 errata framework. Cc: Catalin Marinas Cc: Will Deacon Cc: Mathieu Poirier Cc: Suzuki Poulose Cc: coresight@lists.linaro.org Cc: linux-doc@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Suzuki K Poulose Acked-by: Catalin Marinas Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1643120437-14352-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Mathieu Poirier --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 23 +++++++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/tools/cpucaps | 1 + 4 files changed, 35 insertions(+) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index c9b30e6c2b6c..e0ef3e9a4b8b 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -54,6 +54,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0033436e3416..fecf2b09e870 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -796,6 +796,29 @@ config ARM64_ERRATUM_2064142 If unsure, say Y. +config ARM64_ERRATUM_2038923 + bool "Cortex-A510: 2038923: workaround TRBE corruption with enable" + depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in + default y + help + This option adds the workaround for ARM Cortex-A510 erratum 2038923. + + Affected Cortex-A510 core might cause an inconsistent view on whether trace is + prohibited within the CPU. As a result, the trace buffer or trace buffer state + might be corrupted. This happens after TRBE buffer has been enabled by setting + TRBLIMITR_EL1.E, followed by just a single context synchronization event before + execution changes from a context, in which trace is prohibited to one where it + isn't, or vice versa. In these mentioned conditions, the view of whether trace + is prohibited is inconsistent between parts of the CPU, and the trace buffer or + the trace buffer state might be corrupted. + + Work around this in the driver by preventing an inconsistent view of whether the + trace is prohibited or not based on TRBLIMITR_EL1.E by immediately following a + change to TRBLIMITR_EL1.E with at least one ISB instruction before an ERET, or + two ISB instructions if no ERET is to take place. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index cbb7d5a9aee7..60b0c1f1d912 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -607,6 +607,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2038923 + { + .desc = "ARM erratum 2038923", + .capability = ARM64_WORKAROUND_2038923, + + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif { } }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index fca3cb329e1d..45a06d36d080 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -56,6 +56,7 @@ WORKAROUND_1463225 WORKAROUND_1508412 WORKAROUND_1542419 WORKAROUND_2064142 +WORKAROUND_2038923 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE From 708e8af4924ec2fdd5b81fe09192c6bac2f86935 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 25 Jan 2022 19:50:34 +0530 Subject: [PATCH 0732/1295] arm64: errata: Add detection for TRBE trace data corruption TRBE implementations affected by Arm erratum #1902691 might corrupt trace data or deadlock, when it's being written into the memory. So effectively TRBE is broken and hence cannot be used to capture trace data. This adds a new errata ARM64_ERRATUM_1902691 in arm64 errata framework. Cc: Catalin Marinas Cc: Will Deacon Cc: Mathieu Poirier Cc: Suzuki Poulose Cc: coresight@lists.linaro.org Cc: linux-doc@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Suzuki K Poulose Acked-by: Catalin Marinas Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1643120437-14352-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Mathieu Poirier --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 18 ++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/tools/cpucaps | 1 + 4 files changed, 30 insertions(+) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index e0ef3e9a4b8b..50018f60c4d4 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -56,6 +56,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #1902691 | ARM64_ERRATUM_1902691 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fecf2b09e870..9c3cf3875785 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -819,6 +819,24 @@ config ARM64_ERRATUM_2038923 If unsure, say Y. +config ARM64_ERRATUM_1902691 + bool "Cortex-A510: 1902691: workaround TRBE trace corruption" + depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in + default y + help + This option adds the workaround for ARM Cortex-A510 erratum 1902691. + + Affected Cortex-A510 core might cause trace data corruption, when being written + into the memory. Effectively TRBE is broken and hence cannot be used to capture + trace data. + + Work around this problem in the driver by just preventing TRBE initialization on + affected cpus. The firmware must have disabled the access to TRBE for the kernel + on such implementations. This will cover the kernel for any firmware that doesn't + do this already. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 60b0c1f1d912..a3336dfb5a8a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -615,6 +615,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Cortex-A510 r0p0 - r0p2 */ ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1902691 + { + .desc = "ARM erratum 1902691", + .capability = ARM64_WORKAROUND_1902691, + + /* Cortex-A510 r0p0 - r0p1 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1) + }, #endif { } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 45a06d36d080..e7719e8f18de 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -57,6 +57,7 @@ WORKAROUND_1508412 WORKAROUND_1542419 WORKAROUND_2064142 WORKAROUND_2038923 +WORKAROUND_1902691 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE From 43f2517955875be5d96b641fba33d73097fe3cd9 Mon Sep 17 00:00:00 2001 From: Anitha Chrisanthus Date: Thu, 27 Jan 2022 10:45:46 -0800 Subject: [PATCH 0733/1295] drm/kmb: Fix for build errors with Warray-bounds This fixes the following build error drivers/gpu/drm/kmb/kmb_plane.c: In function 'kmb_plane_atomic_disable': drivers/gpu/drm/kmb/kmb_plane.c:165:34: error: array subscript 3 is above array bounds of 'struct layer_status[2]' [-Werror=array-bounds] 165 | kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL2_ENABLE; | ~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from drivers/gpu/drm/kmb/kmb_plane.c:17: drivers/gpu/drm/kmb/kmb_drv.h:61:41: note: while referencing 'plane_status' 61 | struct layer_status plane_status[KMB_MAX_PLANES]; | ^~~~~~~~~~~~ drivers/gpu/drm/kmb/kmb_plane.c:162:34: error: array subscript 2 is above array bounds of 'struct layer_status[2]' [-Werror=array-bounds] 162 | kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL1_ENABLE; | ~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from drivers/gpu/drm/kmb/kmb_plane.c:17: drivers/gpu/drm/kmb/kmb_drv.h:61:41: note: while referencing 'plane_status' 61 | struct layer_status plane_status[KMB_MAX_PLANES]; | ^~~~~~~~~~~~ Fixes: 7f7b96a8a0a1 ("drm/kmb: Add support for KeemBay Display") Signed-off-by: Anitha Chrisanthus Reviewed-by: Kees Cook Link: https://patchwork.freedesktop.org/patch/msgid/20220127194227.2213608-1-anitha.chrisanthus@intel.com --- drivers/gpu/drm/kmb/kmb_plane.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/kmb/kmb_plane.c b/drivers/gpu/drm/kmb/kmb_plane.c index 00404ba4126d..2735b8eb3537 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.c +++ b/drivers/gpu/drm/kmb/kmb_plane.c @@ -158,12 +158,6 @@ static void kmb_plane_atomic_disable(struct drm_plane *plane, case LAYER_1: kmb->plane_status[plane_id].ctrl = LCD_CTRL_VL2_ENABLE; break; - case LAYER_2: - kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL1_ENABLE; - break; - case LAYER_3: - kmb->plane_status[plane_id].ctrl = LCD_CTRL_GL2_ENABLE; - break; } kmb->plane_status[plane_id].disable = true; From b9199181a9ef8252e47e207be8c23e1f50662620 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 27 Jan 2022 22:44:46 +0500 Subject: [PATCH 0734/1295] selftests: futex: Use variable MAKE instead of make MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recursive make commands should always use the variable MAKE, not the explicit command name ‘make’. This has benefits and removes the following warning when multiple jobs are used for the build: make[2]: warning: jobserver unavailable: using -j1. Add '+' to parent make rule. Fixes: a8ba798bc8ec ("selftests: enable O and KBUILD_OUTPUT") Signed-off-by: Muhammad Usama Anjum Reviewed-by: André Almeida Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 12631f0076a1..11e157d7533b 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -11,7 +11,7 @@ all: @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ if [ -e $$DIR/$(TEST_PROGS) ]; then \ rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; \ fi \ @@ -32,6 +32,6 @@ override define CLEAN @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ done endef From e051cdf655fa016692008a446a060eff06222bb5 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:21 +0000 Subject: [PATCH 0735/1295] selftests: openat2: Print also errno in failure messages In E_func() macro, on error, print also errno in order to aid debugging. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/helpers.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h index a6ea27344db2..ad5d0ba5b6ce 100644 --- a/tools/testing/selftests/openat2/helpers.h +++ b/tools/testing/selftests/openat2/helpers.h @@ -62,11 +62,12 @@ bool needs_openat2(const struct open_how *how); (similar to chroot(2)). */ #endif /* RESOLVE_IN_ROOT */ -#define E_func(func, ...) \ - do { \ - if (func(__VA_ARGS__) < 0) \ - ksft_exit_fail_msg("%s:%d %s failed\n", \ - __FILE__, __LINE__, #func);\ +#define E_func(func, ...) \ + do { \ + errno = 0; \ + if (func(__VA_ARGS__) < 0) \ + ksft_exit_fail_msg("%s:%d %s failed - errno:%d\n", \ + __FILE__, __LINE__, #func, errno); \ } while (0) #define E_asprintf(...) E_func(asprintf, __VA_ARGS__) From ea3396725aa143dd42fe388cb67e44c90d2fb719 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:22 +0000 Subject: [PATCH 0736/1295] selftests: openat2: Add missing dependency in Makefile Add a dependency on header helpers.h to the main target; while at that add to helpers.h also a missing include for bool types. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/Makefile | 2 +- tools/testing/selftests/openat2/helpers.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 4b93b1417b86..843ba56d8e49 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -5,4 +5,4 @@ TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test include ../lib.mk -$(TEST_GEN_PROGS): helpers.c +$(TEST_GEN_PROGS): helpers.c helpers.h diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h index ad5d0ba5b6ce..7056340b9339 100644 --- a/tools/testing/selftests/openat2/helpers.h +++ b/tools/testing/selftests/openat2/helpers.h @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include +#include #include #include #include "../kselftest.h" From ac9e0a250bb155078601a5b999aab05f2a04d1ab Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:23 +0000 Subject: [PATCH 0737/1295] selftests: openat2: Skip testcases that fail with EOPNOTSUPP Skip testcases that fail since the requested valid flags combination is not supported by the underlying filesystem. Cc: Aleksa Sarai Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/openat2_test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index 1bddbe934204..7fb902099de4 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -259,6 +259,16 @@ void test_openat2_flags(void) unlink(path); fd = sys_openat2(AT_FDCWD, path, &test->how); + if (fd < 0 && fd == -EOPNOTSUPP) { + /* + * Skip the testcase if it failed because not supported + * by FS. (e.g. a valid O_TMPFILE combination on NFS) + */ + ksft_test_result_skip("openat2 with %s fails with %d (%s)\n", + test->name, fd, strerror(-fd)); + goto next; + } + if (test->err >= 0) failed = (fd < 0); else @@ -303,7 +313,7 @@ skip: else resultfn("openat2 with %s fails with %d (%s)\n", test->name, test->err, strerror(-test->err)); - +next: free(fdpath); fflush(stdout); } From dae1d8ac31896988e7313384c0370176a75e9b45 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 26 Jan 2022 10:27:19 +0000 Subject: [PATCH 0738/1295] selftests: skip mincore.check_file_mmap when fs lacks needed support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Report mincore.check_file_mmap as SKIP instead of FAIL if the underlying filesystem lacks support of O_TMPFILE or fallocate since such failures are not really related to mincore functionality. Cc: Ricardo Cañuelo Signed-off-by: Cristian Marussi Signed-off-by: Shuah Khan --- .../selftests/mincore/mincore_selftest.c | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index e54106643337..4c88238fc8f0 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -207,15 +207,21 @@ TEST(check_file_mmap) errno = 0; fd = open(".", O_TMPFILE | O_RDWR, 0600); - ASSERT_NE(-1, fd) { - TH_LOG("Can't create temporary file: %s", - strerror(errno)); + if (fd < 0) { + ASSERT_EQ(errno, EOPNOTSUPP) { + TH_LOG("Can't create temporary file: %s", + strerror(errno)); + } + SKIP(goto out_free, "O_TMPFILE not supported by filesystem."); } errno = 0; retval = fallocate(fd, 0, 0, FILE_SIZE); - ASSERT_EQ(0, retval) { - TH_LOG("Error allocating space for the temporary file: %s", - strerror(errno)); + if (retval) { + ASSERT_EQ(errno, EOPNOTSUPP) { + TH_LOG("Error allocating space for the temporary file: %s", + strerror(errno)); + } + SKIP(goto out_close, "fallocate not supported by filesystem."); } /* @@ -271,7 +277,9 @@ TEST(check_file_mmap) } munmap(addr, FILE_SIZE); +out_close: close(fd); +out_free: free(vec); } From 4ed308c445a1e3abac8f6c17928c1cb533867e38 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 25 Jan 2022 09:19:10 -0500 Subject: [PATCH 0739/1295] ftrace: Have architectures opt-in for mcount build time sorting First S390 complained that the sorting of the mcount sections at build time caused the kernel to crash on their architecture. Now PowerPC is complaining about it too. And also ARM64 appears to be having issues. It may be necessary to also update the relocation table for the values in the mcount table. Not only do we have to sort the table, but also update the relocations that may be applied to the items in the table. If the system is not relocatable, then it is fine to sort, but if it is, some architectures may have issues (although x86 does not as it shifts all addresses the same). Add a HAVE_BUILDTIME_MCOUNT_SORT that an architecture can set to say it is safe to do the sorting at build time. Also update the config to compile in build time sorting in the sorttable code in scripts/ to depend on CONFIG_BUILDTIME_MCOUNT_SORT. Link: https://lore.kernel.org/all/944D10DA-8200-4BA9-8D0A-3BED9AA99F82@linux.ibm.com/ Link: https://lkml.kernel.org/r/20220127153821.3bc1ac6e@gandalf.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Russell King Cc: Yinan Liu Cc: Ard Biesheuvel Cc: Kees Cook Reported-by: Sachin Sant Reviewed-by: Mark Rutland Tested-by: Mark Rutland [arm64] Tested-by: Sachin Sant Fixes: 72b3942a173c ("scripts: ftrace - move the sort-processing in ftrace_init") Signed-off-by: Steven Rostedt (Google) --- arch/arm/Kconfig | 1 + arch/x86/Kconfig | 1 + kernel/trace/Kconfig | 8 +++++++- scripts/Makefile | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fabe39169b12..4c97cb40eebb 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -83,6 +83,7 @@ config ARM select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT + select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ebe8fc76949a..9f5bd41bf660 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -186,6 +186,7 @@ config X86 select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION + select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 752ed89a293b..a5eb5e7fd624 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,10 +70,16 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config HAVE_BUILDTIME_MCOUNT_SORT + bool + help + An architecture selects this if it sorts the mcount_loc section + at build time. + config BUILDTIME_MCOUNT_SORT bool default y - depends on BUILDTIME_TABLE_SORT && !S390 + depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE help Sort the mcount_loc section at build time. diff --git a/scripts/Makefile b/scripts/Makefile index ecd3acacd0ec..ce5aa9030b74 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -25,7 +25,7 @@ HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/x86/include HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED endif -ifdef CONFIG_DYNAMIC_FTRACE +ifdef CONFIG_BUILDTIME_MCOUNT_SORT HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED endif From e629e7b525a179e29d53463d992bdee759c950fb Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 25 Jan 2022 12:07:15 +0800 Subject: [PATCH 0740/1295] tracing/histogram: Fix a potential memory leak for kstrdup() kfree() is missing on an error path to free the memory allocated by kstrdup(): p = param = kstrdup(data->params[i], GFP_KERNEL); So it is better to free it via kfree(p). Link: https://lkml.kernel.org/r/tencent_C52895FD37802832A3E5B272D05008866F0A@qq.com Cc: stable@vger.kernel.org Fixes: d380dcde9a07c ("tracing: Fix now invalid var_ref_vals assumption in trace action") Signed-off-by: Xiaoke Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5e6a988a8a51..cd9610688ddc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3935,6 +3935,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, var_ref_idx = find_var_ref_idx(hist_data, var_ref); if (WARN_ON(var_ref_idx < 0)) { + kfree(p); ret = var_ref_idx; goto err; } From 58c5724ec2cdd72b22107ec5de00d90cc4797796 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 25 Jan 2022 23:19:30 +0900 Subject: [PATCH 0741/1295] tracing: Avoid -Warray-bounds warning for __rel_loc macro Since -Warray-bounds checks the destination size from the type of given pointer, __assign_rel_str() macro gets warned because it passes the pointer to the 'u32' field instead of 'trace_event_raw_*' data structure. Pass the data address calculated from the 'trace_event_raw_*' instead of 'u32' __rel_loc field. Link: https://lkml.kernel.org/r/20220125233154.dac280ed36944c0c2fe6f3ac@kernel.org Cc: Stephen Rothwell Cc: Kees Cook Signed-off-by: Masami Hiramatsu [ This did not fix the warning, but is still a nice clean up ] Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 8c6f7c433518..65d927e059d3 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -318,9 +318,10 @@ TRACE_MAKE_SYSTEM_STR(); #define __get_str(field) ((char *)__get_dynamic_array(field)) #undef __get_rel_dynamic_array -#define __get_rel_dynamic_array(field) \ - ((void *)(&__entry->__rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ +#define __get_rel_dynamic_array(field) \ + ((void *)__entry + \ + offsetof(typeof(*__entry), __rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ (__entry->__rel_loc_##field & 0xffff)) #undef __get_rel_dynamic_array_len From c6d777acdf8f62d4ebaef0e5c6cd8fedbd6e8546 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 25 Jan 2022 14:00:37 -0800 Subject: [PATCH 0742/1295] tracing/perf: Avoid -Warray-bounds warning for __rel_loc macro As done for trace_events.h, also fix the __rel_loc macro in perf.h, which silences the -Warray-bounds warning: In file included from ./include/linux/string.h:253, from ./include/linux/bitmap.h:11, from ./include/linux/cpumask.h:12, from ./include/linux/mm_types_task.h:14, from ./include/linux/mm_types.h:5, from ./include/linux/buildid.h:5, from ./include/linux/module.h:14, from samples/trace_events/trace-events-sample.c:2: In function '__fortify_strcpy', inlined from 'perf_trace_foo_rel_loc' at samples/trace_events/./trace-events-sample.h:519:1: ./include/linux/fortify-string.h:47:33: warning: '__builtin_strcpy' offset 12 is out of the bounds [ 0, 4] [-Warray-bounds] 47 | #define __underlying_strcpy __builtin_strcpy | ^ ./include/linux/fortify-string.h:445:24: note: in expansion of macro '__underlying_strcpy' 445 | return __underlying_strcpy(p, q); | ^~~~~~~~~~~~~~~~~~~ Also make __data struct member a proper flexible array to avoid future problems. Link: https://lkml.kernel.org/r/20220125220037.2738923-1-keescook@chromium.org Cc: Steven Rostedt Cc: Masami Hiramatsu Fixes: 55de2c0b5610c ("tracing: Add '__rel_loc' using trace event macros") Reported-by: Stephen Rothwell Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 5 +++-- include/trace/trace_events.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/perf.h b/include/trace/perf.h index ea4405de175a..5d48c46a3008 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -23,8 +23,9 @@ #undef __get_rel_dynamic_array #define __get_rel_dynamic_array(field) \ - ((void *)(&__entry->__rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ + ((void *)__entry + \ + offsetof(typeof(*__entry), __rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ (__entry->__rel_loc_##field & 0xffff)) #undef __get_rel_dynamic_array_len diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 65d927e059d3..3d29919045af 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -128,7 +128,7 @@ TRACE_MAKE_SYSTEM_STR(); struct trace_event_raw_##name { \ struct trace_entry ent; \ tstruct \ - char __data[0]; \ + char __data[]; \ }; \ \ static struct trace_event_class event_class_##name; From 2201aea114d4c7eedd05865df545dbb987cee5c1 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 25 Jan 2022 17:13:01 -0700 Subject: [PATCH 0743/1295] rtla: Make doc build optional rtla build fails due to doc build dependency on rst2man. Make doc build optional so rtla could be built without docs. Leave the install dependency on doc_install alone. Link: https://lkml.kernel.org/r/20220126001301.79096-1-skhan@linuxfoundation.org Acked-by: Daniel Bristot de Oliveira Signed-off-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 2d52ff0bff7d..7c39728d08de 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -59,7 +59,7 @@ endif .PHONY: all all: rtla -rtla: $(OBJ) doc +rtla: $(OBJ) $(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS) static: $(OBJ) From aa814c51ab7cb3ddeeeeedaa37d599aee7ba6649 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 25 Jan 2022 17:22:34 -0700 Subject: [PATCH 0744/1295] tools/tracing: Update Makefile to build rtla Update tracing Makefile to build/install/clean rtla tragets. Link: https://lkml.kernel.org/r/20220126002234.79337-1-skhan@linuxfoundation.org Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- tools/tracing/Makefile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/tracing/Makefile b/tools/tracing/Makefile index 87e0ec48e2e7..95e485f12d97 100644 --- a/tools/tracing/Makefile +++ b/tools/tracing/Makefile @@ -1,11 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 include ../scripts/Makefile.include -all: latency +all: latency rtla -clean: latency_clean +clean: latency_clean rtla_clean -install: latency_install +install: latency_install rtla_install latency: $(call descend,latency) @@ -16,4 +16,14 @@ latency_install: latency_clean: $(call descend,latency,clean) -.PHONY: all install clean latency latency_install latency_clean +rtla: + $(call descend,rtla) + +rtla_install: + $(call descend,rtla,install) + +rtla_clean: + $(call descend,rtla,clean) + +.PHONY: all install clean latency latency_install latency_clean \ + rtla rtla_install rtla_clean From 798a5b6c195d1c64fd5e9dd252381feb17e5ff22 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 27 Jan 2022 15:44:15 -0600 Subject: [PATCH 0745/1295] tracing: Fix smatch warning for null glob in event_hist_trigger_parse() The recent rename of event_hist_trigger_parse() caused smatch re-evaluation of trace_events_hist.c and as a result an old warning was found: kernel/trace/trace_events_hist.c:6174 event_hist_trigger_parse() error: we previously assumed 'glob' could be null (see line 6166) glob should never be null (and apparently smatch can also figure that out and skip the warning when using the cross-function DB (but which can't be used with a 0day build as it takes too much time to generate)). Nonetheless for clarity, remove the test but add a WARN_ON() in case the code ever changes. Link: https://lkml.kernel.org/r/96925e5c1f116654ada7ea0613d930b1266b5e1c.1643319703.git.zanussi@kernel.org Fixes: f404da6e1d46c ("tracing: Add 'last error' error facility for hist triggers") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index cd9610688ddc..e0860146dd39 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6164,7 +6164,9 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); - if (glob && strlen(glob)) { + WARN_ON(!glob); + + if (strlen(glob)) { hist_err_clear(); last_cmd_set(file, param); } From b59f2f2b865cedd6d1641394b9cd84399bd738ff Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 27 Jan 2022 15:44:16 -0600 Subject: [PATCH 0746/1295] tracing: Fix smatch warning for do while check in event_hist_trigger_parse() The patch ec5ce0987541: "tracing: Allow whitespace to surround hist trigger filter" from Jan 15, 2018, leads to the following Smatch static checker warning: kernel/trace/trace_events_hist.c:6199 event_hist_trigger_parse() warn: 'p' can't be NULL. Since p is always checked for a NULL value at the top of loop and nothing in the rest of the loop will set it to NULL, the warning is correct and might as well be 1 to silence the warning. Link: https://lkml.kernel.org/r/a1d4c79766c0cf61e20438dc35244d216633fef6.1643319703.git.zanussi@kernel.org Fixes: ec5ce09875410 ("tracing: Allow whitespace to surround hist trigger filter") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e0860146dd39..b894d68082ea 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6199,7 +6199,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, continue; } break; - } while (p); + } while (1); if (!p) param = NULL; From 097f1eefedeab528cecbd35586dfe293853ffb17 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 27 Jan 2022 15:44:17 -0600 Subject: [PATCH 0747/1295] tracing: Propagate is_signed to expression During expression parsing, a new expression field is created which should inherit the properties of the operands, such as size and is_signed. is_signed propagation was missing, causing spurious errors with signed operands. Add it in parse_expr() and parse_unary() to fix the problem. Link: https://lkml.kernel.org/r/f4dac08742fd7a0920bf80a73c6c44042f5eaa40.1643319703.git.zanussi@kernel.org Cc: stable@vger.kernel.org Fixes: 100719dcef447 ("tracing: Add simple expression support to hist triggers") Reported-by: Yordan Karadzhov BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215513 Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b894d68082ea..ada87bfb5bb8 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2503,6 +2503,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); expr->fn = hist_field_unary_minus; expr->operands[0] = operand1; + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); expr->type = kstrdup_const(operand1->type, GFP_KERNEL); @@ -2719,6 +2721,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = field_op; expr->type = kstrdup_const(operand1->type, GFP_KERNEL); From 67ab5eb71b37b55f7c5522d080a1b42823351776 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 27 Jan 2022 15:44:18 -0600 Subject: [PATCH 0748/1295] tracing: Don't inc err_log entry count if entry allocation fails tr->n_err_log_entries should only be increased if entry allocation succeeds. Doing it when it fails won't cause any problems other than wasting an entry, but should be fixed anyway. Link: https://lkml.kernel.org/r/cad1ab28f75968db0f466925e7cba5970cec6c29.1643319703.git.zanussi@kernel.org Cc: stable@vger.kernel.org Fixes: 2f754e771b1a6 ("tracing: Don't inc err_log entry count if entry allocation fails") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a569a0cb81ee..c860f582b078 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7740,7 +7740,8 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); - tr->n_err_log_entries++; + else + tr->n_err_log_entries++; return err; } From 5aac9108a180fc06e28d4e7fb00247ce603b72ee Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 27 Jan 2022 14:50:03 +0530 Subject: [PATCH 0749/1295] net: amd-xgbe: Fix skb data length underflow There will be BUG_ON() triggered in include/linux/skbuff.h leading to intermittent kernel panic, when the skb length underflow is detected. Fix this by dropping the packet if such length underflows are seen because of inconsistencies in the hardware descriptors. Fixes: 622c36f143fc ("amd-xgbe: Fix jumbo MTU processing on newer hardware") Suggested-by: Tom Lendacky Signed-off-by: Shyam Sundar S K Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20220127092003.2812745-1-Shyam-sundar.S-k@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 492ac383f16d..ec3b287e3a71 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2550,6 +2550,14 @@ read_again: buf2_len = xgbe_rx_buf2_len(rdata, packet, len); len += buf2_len; + if (buf2_len > rdata->rx.buf.dma_len) { + /* Hardware inconsistency within the descriptors + * that has resulted in a length underflow. + */ + error = 1; + goto skip_data; + } + if (!skb) { skb = xgbe_create_skb(pdata, napi, rdata, buf1_len); @@ -2579,8 +2587,10 @@ skip_data: if (!last || context_next) goto read_again; - if (!skb) + if (!skb || error) { + dev_kfree_skb(skb); goto next_packet; + } /* Be sure we don't exceed the configured MTU */ max_len = netdev->mtu + ETH_HLEN; From 7674b7b559b683478c3832527c59bceb169e701d Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 27 Jan 2022 11:32:22 +0530 Subject: [PATCH 0750/1295] net: amd-xgbe: ensure to reset the tx_timer_active flag Ensure to reset the tx_timer_active flag in xgbe_stop(), otherwise a port restart may result in tx timeout due to uncleared flag. Fixes: c635eaacbf77 ("amd-xgbe: Remove Tx coalescing") Co-developed-by: Sudheesh Mavila Signed-off-by: Sudheesh Mavila Signed-off-by: Raju Rangoju Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20220127060222.453371-1-Raju.Rangoju@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index ec3b287e3a71..a3593290886f 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -721,7 +721,9 @@ static void xgbe_stop_timers(struct xgbe_prv_data *pdata) if (!channel->tx_ring) break; + /* Deactivate the Tx timer */ del_timer_sync(&channel->tx_timer); + channel->tx_timer_active = 0; } } From 42c9b28e6862d16db82a56f5667cf4d1f6658cf6 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 27 Dec 2021 13:14:02 -0300 Subject: [PATCH 0751/1295] ARM: dts: imx23-evk: Remove MX23_PAD_SSP1_DETECT from hog group Currently, SD card fails to mount due to the following pinctrl error: [ 11.170000] imx23-pinctrl 80018000.pinctrl: pin SSP1_DETECT already requested by 80018000.pinctrl; cannot claim for 80010000.spi [ 11.180000] imx23-pinctrl 80018000.pinctrl: pin-65 (80010000.spi) status -22 [ 11.190000] imx23-pinctrl 80018000.pinctrl: could not request pin 65 (SSP1_DETECT) from group mmc0-pins-fixup.0 on device 80018000.pinctrl [ 11.200000] mxs-mmc 80010000.spi: Error applying setting, reverse things back Fix it by removing the MX23_PAD_SSP1_DETECT pin from the hog group as it is already been used by the mmc0-pins-fixup pinctrl group. With this change the rootfs can be mounted and the imx23-evk board can boot successfully. Cc: Fixes: bc3875f1a61e ("ARM: dts: mxs: modify mx23/mx28 dts files to use pinctrl headers") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx23-evk.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/boot/dts/imx23-evk.dts b/arch/arm/boot/dts/imx23-evk.dts index 8cbaf1c81174..3b609d987d88 100644 --- a/arch/arm/boot/dts/imx23-evk.dts +++ b/arch/arm/boot/dts/imx23-evk.dts @@ -79,7 +79,6 @@ MX23_PAD_LCD_RESET__GPIO_1_18 MX23_PAD_PWM3__GPIO_1_29 MX23_PAD_PWM4__GPIO_1_30 - MX23_PAD_SSP1_DETECT__SSP1_DETECT >; fsl,drive-strength = ; fsl,voltage = ; From 0444f82766f0b5b9c8302ad802dafa5dd0e722d0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 27 Jan 2022 14:57:17 +0100 Subject: [PATCH 0752/1295] ALSA: hda: Fix signedness of sscanf() arguments The %x format of sscanf() takes an unsigned int pointer, while we pass a signed int pointer. Practically it's OK, but this may result in a compile warning. Let's fix it. Fixes: a235d5b8e550 ("ALSA: hda: Allow model option to specify PCI SSID alias") Reported-by: kernel test robot Link: https://lore.kernel.org/r/20220127135717.31751-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_auto_parser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c index 82c492b05667..cd1db943b7e0 100644 --- a/sound/pci/hda/hda_auto_parser.c +++ b/sound/pci/hda/hda_auto_parser.c @@ -981,7 +981,7 @@ void snd_hda_pick_fixup(struct hda_codec *codec, int id = HDA_FIXUP_ID_NOT_SET; const char *name = NULL; const char *type = NULL; - int vendor, device; + unsigned int vendor, device; if (codec->fixup_id != HDA_FIXUP_ID_NOT_SET) return; From 9129886b88185962538180625ca8051362b01327 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 22 Jan 2022 18:19:49 +0000 Subject: [PATCH 0753/1295] parisc: Drop __init from map_pages declaration With huge kernel pages, we randomly eat a SPARC in map_pages(). This is fixed by dropping __init from the declaration. However, map_pages references the __init routine memblock_alloc_try_nid via memblock_alloc. Thus, it needs to be marked with __ref. memblock_alloc is only called before the kernel text is set to readonly. The __ref on free_initmem is no longer needed. Comment regarding map_pages being in the init section is removed. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Helge Deller --- arch/parisc/mm/init.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 1ae31db9988f..1dc2e88e7b04 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -337,9 +337,9 @@ static void __init setup_bootmem(void) static bool kernel_set_to_readonly; -static void __init map_pages(unsigned long start_vaddr, - unsigned long start_paddr, unsigned long size, - pgprot_t pgprot, int force) +static void __ref map_pages(unsigned long start_vaddr, + unsigned long start_paddr, unsigned long size, + pgprot_t pgprot, int force) { pmd_t *pmd; pte_t *pg_table; @@ -449,7 +449,7 @@ void __init set_kernel_text_rw(int enable_read_write) flush_tlb_all(); } -void __ref free_initmem(void) +void free_initmem(void) { unsigned long init_begin = (unsigned long)__init_begin; unsigned long init_end = (unsigned long)__init_end; @@ -463,7 +463,6 @@ void __ref free_initmem(void) /* The init text pages are marked R-X. We have to * flush the icache and mark them RW- * - * This is tricky, because map_pages is in the init section. * Do a dummy remap of the data section first (the data * section is already PAGE_KERNEL) to pull in the TLB entries * for map_kernel */ From b7d6f44a0fa716a82969725516dc0b16bc7cd514 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Wed, 26 Jan 2022 20:39:05 +0000 Subject: [PATCH 0754/1295] parisc: Fix data TLB miss in sba_unmap_sg Rolf Eike Beer reported the following bug: [1274934.746891] Bad Address (null pointer deref?): Code=15 (Data TLB miss fault) at addr 0000004140000018 [1274934.746891] CPU: 3 PID: 5549 Comm: cmake Not tainted 5.15.4-gentoo-parisc64 #4 [1274934.746891] Hardware name: 9000/785/C8000 [1274934.746891] [1274934.746891] YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI [1274934.746891] PSW: 00001000000001001111111000001110 Not tainted [1274934.746891] r00-03 000000ff0804fe0e 0000000040bc9bc0 00000000406760e4 0000004140000000 [1274934.746891] r04-07 0000000040b693c0 0000004140000000 000000004a2b08b0 0000000000000001 [1274934.746891] r08-11 0000000041f98810 0000000000000000 000000004a0a7000 0000000000000001 [1274934.746891] r12-15 0000000040bddbc0 0000000040c0cbc0 0000000040bddbc0 0000000040bddbc0 [1274934.746891] r16-19 0000000040bde3c0 0000000040bddbc0 0000000040bde3c0 0000000000000007 [1274934.746891] r20-23 0000000000000006 000000004a368950 0000000000000000 0000000000000001 [1274934.746891] r24-27 0000000000001fff 000000000800000e 000000004a1710f0 0000000040b693c0 [1274934.746891] r28-31 0000000000000001 0000000041f988b0 0000000041f98840 000000004a171118 [1274934.746891] sr00-03 00000000066e5800 0000000000000000 0000000000000000 00000000066e5800 [1274934.746891] sr04-07 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [1274934.746891] [1274934.746891] IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000406760e8 00000000406760ec [1274934.746891] IIR: 48780030 ISR: 0000000000000000 IOR: 0000004140000018 [1274934.746891] CPU: 3 CR30: 00000040e3a9c000 CR31: ffffffffffffffff [1274934.746891] ORIG_R28: 0000000040acdd58 [1274934.746891] IAOQ[0]: sba_unmap_sg+0xb0/0x118 [1274934.746891] IAOQ[1]: sba_unmap_sg+0xb4/0x118 [1274934.746891] RP(r2): sba_unmap_sg+0xac/0x118 [1274934.746891] Backtrace: [1274934.746891] [<00000000402740cc>] dma_unmap_sg_attrs+0x6c/0x70 [1274934.746891] [<000000004074d6bc>] scsi_dma_unmap+0x54/0x60 [1274934.746891] [<00000000407a3488>] mptscsih_io_done+0x150/0xd70 [1274934.746891] [<0000000040798600>] mpt_interrupt+0x168/0xa68 [1274934.746891] [<0000000040255a48>] __handle_irq_event_percpu+0xc8/0x278 [1274934.746891] [<0000000040255c34>] handle_irq_event_percpu+0x3c/0xd8 [1274934.746891] [<000000004025ecb4>] handle_percpu_irq+0xb4/0xf0 [1274934.746891] [<00000000402548e0>] generic_handle_irq+0x50/0x70 [1274934.746891] [<000000004019a254>] call_on_stack+0x18/0x24 [1274934.746891] [1274934.746891] Kernel panic - not syncing: Bad Address (null pointer deref?) The bug is caused by overrunning the sglist and incorrectly testing sg_dma_len(sglist) before nents. Normally this doesn't cause a crash, but in this case sglist crossed a page boundary. This occurs in the following code: while (sg_dma_len(sglist) && nents--) { The fix is simply to test nents first and move the decrement of nents into the loop. Reported-by: Rolf Eike Beer Signed-off-by: John David Anglin Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- drivers/parisc/sba_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index e60690d38d67..374b9199878d 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1047,7 +1047,7 @@ sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, spin_unlock_irqrestore(&ioc->res_lock, flags); #endif - while (sg_dma_len(sglist) && nents--) { + while (nents && sg_dma_len(sglist)) { sba_unmap_page(dev, sg_dma_address(sglist), sg_dma_len(sglist), direction, 0); @@ -1056,6 +1056,7 @@ sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, ioc->usingle_calls--; /* kluge since call is unmap_sg() */ #endif ++sglist; + nents--; } DBG_RUN_SG("%s() DONE (nents %d)\n", __func__, nents); From d7da660cab47183cded65e11b64497d0f56c6edf Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Thu, 27 Jan 2022 22:33:41 +0000 Subject: [PATCH 0755/1295] parisc: Fix sglist access in ccio-dma.c This patch implements the same bug fix to ccio-dma.c as to sba_iommu.c. It ensures that only the allocated entries of the sglist are accessed. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- drivers/parisc/ccio-dma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 059566f54429..9be007c9420f 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1003,7 +1003,7 @@ ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, ioc->usg_calls++; #endif - while(sg_dma_len(sglist) && nents--) { + while (nents && sg_dma_len(sglist)) { #ifdef CCIO_COLLECT_STATS ioc->usg_pages += sg_dma_len(sglist) >> PAGE_SHIFT; @@ -1011,6 +1011,7 @@ ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, ccio_unmap_page(dev, sg_dma_address(sglist), sg_dma_len(sglist), direction, 0); ++sglist; + nents--; } DBG_RUN_SG("%s() DONE (nents %d)\n", __func__, nents); From 50806fd91428a28d5daa649310dd0ca05b39c118 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 17:55:26 +0000 Subject: [PATCH 0756/1295] kselftest/arm64: Skip VL_INHERIT tests for unsupported vector types Currently we unconditionally test the ability to set the vector length inheritance flag via ptrace meaning that we generate false failures on systems that don't support SVE when we attempt to set the vector length there. Check the hwcap and mark the tests as skipped when it's not present. Fixes: 0ba1ce1e8605 ("selftests: arm64: Add coverage of ptrace flags for SVE VL inheritance") Signed-off-by: Mark Brown Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20220124175527.3260234-2-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index af798b9d232c..0cf78360c5bc 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -557,7 +557,14 @@ static int do_parent(pid_t child) } /* prctl() flags */ - ptrace_set_get_inherit(child, &vec_types[i]); + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_get_inherit(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s SVE_PT_VL_INHERIT set\n", + vec_types[i].name); + ksft_test_result_skip("%s SVE_PT_VL_INHERIT cleared\n", + vec_types[i].name); + } /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) { From 9ae279ecabe3e5bb85567e6c7371f4d35cfa00d6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 24 Jan 2022 17:55:27 +0000 Subject: [PATCH 0757/1295] kselftest/arm64: Correct logging of FPSIMD register read via ptrace There's a cut'n'paste error in the logging for our test for reading register state back via ptrace, correctly say that we did a read instead of a write. Signed-off-by: Mark Brown Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20220124175527.3260234-3-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 0cf78360c5bc..a3c1e67441f9 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -261,7 +261,7 @@ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) } ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD, - "Set FPSIMD registers via %s\n", type->name); + "Got FPSIMD registers via %s\n", type->name); if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD) goto out; From c8980fcb210851138cb34c9a8cb0cf0c09f07bf9 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Fri, 21 Jan 2022 10:01:46 +0100 Subject: [PATCH 0758/1295] xen/x2apic: enable x2apic mode when supported for HVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no point in disabling x2APIC mode when running as a Xen HVM guest, just enable it when available. Remove some unneeded wrapping around the detection functions, and simply provide a xen_x2apic_available helper that's a wrapper around x2apic_supported. Signed-off-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20220121090146.13697-1-roger.pau@citrix.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/hypervisor.h | 14 -------------- arch/x86/xen/enlighten_hvm.c | 13 ++++--------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 1bf2ad34188a..16f548a661cf 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,20 +43,6 @@ static inline uint32_t xen_cpuid_base(void) return hypervisor_cpuid_base("XenVMMXenVMM", 2); } -#ifdef CONFIG_XEN -extern bool __init xen_hvm_need_lapic(void); - -static inline bool __init xen_x2apic_para_available(void) -{ - return xen_hvm_need_lapic(); -} -#else -static inline bool __init xen_x2apic_para_available(void) -{ - return (xen_cpuid_base() != 0); -} -#endif - struct pci_dev; #ifdef CONFIG_XEN_PV_DOM0 diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 42300941ec29..6448c5071117 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -242,15 +243,9 @@ static __init int xen_parse_no_vector_callback(char *arg) } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); -bool __init xen_hvm_need_lapic(void) +static __init bool xen_x2apic_available(void) { - if (xen_pv_domain()) - return false; - if (!xen_hvm_domain()) - return false; - if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) - return false; - return true; + return x2apic_supported(); } static __init void xen_hvm_guest_late_init(void) @@ -312,7 +307,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_para_available, + .init.x2apic_available = xen_x2apic_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, From 56f289a8d23addfa4408a08f07f42fcfe2a7bd69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Jan 2022 07:31:53 -0800 Subject: [PATCH 0759/1295] KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr Add a helper to handle converting the u64 userspace address embedded in struct kvm_device_attr into a userspace pointer, it's all too easy to forget the intermediate "unsigned long" cast as well as the truncation check. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5481d2259f02..76a5dcfcabdb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4332,7 +4332,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; } return r; +} +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR(-EFAULT); + return uaddr; } long kvm_arch_dev_ioctl(struct file *filp, @@ -5025,11 +5033,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + u64 __user *uaddr = kvm_get_attr_addr(attr); int r; - if ((u64)(unsigned long)uaddr != attr->addr) - return -EFAULT; + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: @@ -5048,12 +5056,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + u64 __user *uaddr = kvm_get_attr_addr(attr); struct kvm *kvm = vcpu->kvm; int r; - if ((u64)(unsigned long)uaddr != attr->addr) - return -EFAULT; + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: { From dd6e631220181162478984d2d46dd979e04d8e75 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jan 2022 07:49:45 -0500 Subject: [PATCH 0760/1295] KVM: x86: add system attribute to retrieve full set of supported xsave states Because KVM_GET_SUPPORTED_CPUID is meant to be passed (by simple-minded VMMs) to KVM_SET_CPUID2, it cannot include any dynamic xsave states that have not been enabled. Probing those, for example so that they can be passed to ARCH_REQ_XCOMP_GUEST_PERM, requires a new ioctl or arch_prctl. The latter is in fact worse, even though that is what the rest of the API uses, because it would require supported_xcr0 to be moved from the KVM module to the kernel just for this use. In addition, the value would be nonsensical (or an error would have to be returned) until the KVM module is loaded in. Therefore, to limit the growth of system ioctls, add a /dev/kvm variant of KVM_{GET,HAS}_DEVICE_ATTR, and implement it in x86 with just one group (0) and attribute (KVM_X86_XCOMP_GUEST_SUPP). Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++- arch/x86/include/uapi/asm/kvm.h | 3 ++ arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bb8cfddbb22d..a4267104db50 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3268,6 +3268,7 @@ number. :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set) :Type: device ioctl, vm ioctl, vcpu ioctl :Parameters: struct kvm_device_attr :Returns: 0 on success, -1 on error @@ -3302,7 +3303,8 @@ transferred is defined by the particular attribute. ------------------------ :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device :Type: device ioctl, vm ioctl, vcpu ioctl :Parameters: struct kvm_device_attr :Returns: 0 on success, -1 on error diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2da3316bb559..bf6e96011dfe 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -452,6 +452,9 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76a5dcfcabdb..c25a6ef0ff06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4230,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_SYS_ATTRIBUTES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4343,6 +4344,40 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) return uaddr; } +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + + if (attr->group) + return -ENXIO; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + if (put_user(supported_xcr0, uaddr)) + return -EFAULT; + return 0; + default: + return -ENXIO; + break; + } +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + if (attr->group) + return -ENXIO; + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + return 0; + default: + return -ENXIO; + } +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4431,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; + } default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 9563d294f181..b46bcdb0cab1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING From b19c99b9f4486f23e3b7248dd4ce3d83e19b9032 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jan 2022 07:51:00 -0500 Subject: [PATCH 0761/1295] selftests: kvm: check dynamic bits against KVM_X86_XCOMP_GUEST_SUPP Provide coverage for the new API. Signed-off-by: Paolo Bonzini --- tools/arch/x86/include/uapi/asm/kvm.h | 3 +++ tools/include/uapi/linux/kvm.h | 1 + .../testing/selftests/kvm/lib/x86_64/processor.c | 15 +++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 2da3316bb559..bf6e96011dfe 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -452,6 +452,9 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 9563d294f181..b46bcdb0cab1 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c1d1c195a838..9f000dfb5594 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -667,8 +667,23 @@ static bool is_xfd_supported(void) void vm_xsave_req_perm(int bit) { + int kvm_fd; u64 bitmask; long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + exit(KSFT_SKIP); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + if (!(bitmask & (1ULL << bit))) + exit(KSFT_SKIP); if (!is_xfd_supported()) exit(KSFT_SKIP); From f80ae0ef089a09e8c18da43a382c3caac9a424a7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 12 Jan 2022 18:01:30 +0100 Subject: [PATCH 0762/1295] KVM: nVMX: Also filter MSR_IA32_VMX_TRUE_PINBASED_CTLS when eVMCS Similar to MSR_IA32_VMX_EXIT_CTLS/MSR_IA32_VMX_TRUE_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS/MSR_IA32_VMX_TRUE_ENTRY_CTLS pair, MSR_IA32_VMX_TRUE_PINBASED_CTLS needs to be filtered the same way MSR_IA32_VMX_PINBASED_CTLS is currently filtered as guests may solely rely on 'true' MSR data. Note, none of the currently existing Windows/Hyper-V versions are known to stumble upon the unfiltered MSR_IA32_VMX_TRUE_PINBASED_CTLS, the change is aimed at making the filtering future proof. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220112170134.1904308-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index ba6f99f584ac..a7ed30d5647a 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -362,6 +362,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) case MSR_IA32_VMX_PROCBASED_CTLS2: ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; break; From 7a601e2cf61558dfd534a9ecaad09f5853ad8204 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 12 Jan 2022 18:01:31 +0100 Subject: [PATCH 0763/1295] KVM: nVMX: eVMCS: Filter out VM_EXIT_SAVE_VMX_PREEMPTION_TIMER Enlightened VMCS v1 doesn't have VMX_PREEMPTION_TIMER_VALUE field, PIN_BASED_VMX_PREEMPTION_TIMER is also filtered out already so it makes sense to filter out VM_EXIT_SAVE_VMX_PREEMPTION_TIMER too. Note, none of the currently existing Windows/Hyper-V versions are known to enable 'save VMX-preemption timer value' when eVMCS is in use, the change is aimed at making the filtering future proof. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220112170134.1904308-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 16731d2cf231..3a461a32128b 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -59,7 +59,9 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_SHADOW_VMCS | \ SECONDARY_EXEC_TSC_SCALING | \ SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) From 2423a4c0d17418eca1ba1e3f48684cb2ab7523d5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 12 Jan 2022 18:01:32 +0100 Subject: [PATCH 0764/1295] KVM: nVMX: Rename vmcs_to_field_offset{,_table} vmcs_to_field_offset{,_table} may sound misleading as VMCS is an opaque blob which is not supposed to be accessed directly. In fact, vmcs_to_field_offset{,_table} are related to KVM defined VMCS12 structure. Rename vmcs_field_to_offset() to get_vmcs12_field_offset() for clarity. No functional change intended. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20220112170134.1904308-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- arch/x86/kvm/vmx/vmcs12.c | 4 ++-- arch/x86/kvm/vmx/vmcs12.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2777cea05cc0..e614a5555aab 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5113,7 +5113,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -5216,7 +5216,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -6464,7 +6464,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ - if (!vmcs_field_to_offset_table[i]) + if (!vmcs12_field_offsets[i]) continue; idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index cab6ba7a5005..2251b60920f8 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -8,7 +8,7 @@ FIELD(number, name), \ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) -const unsigned short vmcs_field_to_offset_table[] = { +const unsigned short vmcs12_field_offsets[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table); +const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 2a45f026ee11..746129ddd5ae 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(guest_pml_index, 996); } -extern const unsigned short vmcs_field_to_offset_table[]; +extern const unsigned short vmcs12_field_offsets[]; extern const unsigned int nr_vmcs12_fields; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short get_vmcs12_field_offset(unsigned long field) { unsigned short offset; unsigned int index; @@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field) return -ENOENT; index = array_index_nospec(index, nr_vmcs12_fields); - offset = vmcs_field_to_offset_table[index]; + offset = vmcs12_field_offsets[index]; if (offset == 0) return -ENOENT; return offset; From 892a42c10ddb945d3a4dcf07dccdf9cb98b21548 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 12 Jan 2022 18:01:33 +0100 Subject: [PATCH 0765/1295] KVM: nVMX: Implement evmcs_field_offset() suitable for handle_vmread() In preparation to allowing reads from Enlightened VMCS from handle_vmread(), implement evmcs_field_offset() to get the correct read offset. get_evmcs_offset(), which is being used by KVM-on-Hyper-V, is almost what's needed but a few things need to be adjusted. First, WARN_ON() is unacceptable for handle_vmread() as any field can (in theory) be supplied by the guest and not all fields are defined in eVMCS v1. Second, we need to handle 'holes' in eVMCS (missing fields). It also sounds like a good idea to WARN_ON() if such fields are ever accessed by KVM-on-Hyper-V. Implement dedicated evmcs_field_offset() helper. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220112170134.1904308-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 3 +-- arch/x86/kvm/vmx/evmcs.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index a7ed30d5647a..87e3dc10edf4 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -12,8 +12,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); -#if IS_ENABLED(CONFIG_HYPERV) - #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} @@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); +#if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 3a461a32128b..9bc2521b159e 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -65,8 +65,6 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) -#if IS_ENABLED(CONFIG_HYPERV) - struct evmcs_field { u16 offset; u16 clean_field; @@ -75,26 +73,44 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) { unsigned int index = ROL16(field, 6); const struct evmcs_field *evmcs_field; - if (unlikely(index >= nr_evmcs_1_fields)) { - WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", - field); + if (unlikely(index >= nr_evmcs_1_fields)) return -ENOENT; - } evmcs_field = &vmcs_field_to_evmcs_1[index]; + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + if (clean_field) *clean_field = evmcs_field->clean_field; return evmcs_field->offset; } +#if IS_ENABLED(CONFIG_HYPERV) + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", + field); + + return offset; +} + static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; From 6cbbaab60ff33f59355492c241318046befd9ffc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 12 Jan 2022 18:01:34 +0100 Subject: [PATCH 0766/1295] KVM: nVMX: Allow VMREAD when Enlightened VMCS is in use Hyper-V TLFS explicitly forbids VMREAD and VMWRITE instructions when Enlightened VMCS interface is in use: "Any VMREAD or VMWRITE instructions while an enlightened VMCS is active is unsupported and can result in unexpected behavior."" Windows 11 + WSL2 seems to ignore this, attempts to VMREAD VMCS field 0x4404 ("VM-exit interruption information") are observed. Failing these attempts with nested_vmx_failInvalid() makes such guests unbootable. Microsoft confirms this is a Hyper-V bug and claims that it'll get fixed eventually but for the time being we need a workaround. (Temporary) allow VMREAD to get data from the currently loaded Enlightened VMCS. Note: VMWRITE instructions remain forbidden, it is not clear how to handle them properly and hopefully won't ever be needed. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20220112170134.1904308-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.h | 12 +++++++++ arch/x86/kvm/vmx/nested.c | 55 +++++++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 9bc2521b159e..8d70f9aea94b 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -98,6 +98,18 @@ static __always_inline int evmcs_field_offset(unsigned long field, return evmcs_field->offset; } +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + #if IS_ENABLED(CONFIG_HYPERV) static __always_inline int get_evmcs_offset(unsigned long field, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e614a5555aab..ba34e94049c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,6 +7,7 @@ #include #include "cpuid.h" +#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" @@ -5101,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - /* - * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, - * any VMREAD sets the ALU flags for VMfailInvalid. - */ - if (vmx->nested.current_vmptr == INVALID_GPA || - (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) - return nested_vmx_failInvalid(vcpu); - /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = get_vmcs12_field_offset(field); - if (offset < 0) - return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + /* + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == INVALID_GPA || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + offset = get_vmcs12_field_offset(field); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - /* Read the field, zero-extended to a u64 value */ - value = vmcs12_read_any(vmcs12, field, offset); + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); + } else { + /* + * Hyper-V TLFS (as of 6.0b) explicitly states, that while an + * enlightened VMCS is active VMREAD/VMWRITE instructions are + * unsupported. Unfortunately, certain versions of Windows 11 + * don't comply with this requirement which is not enforced in + * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a + * workaround, as misbehaving guests will panic on VM-Fail. + * Note, enlightened VMCS is incompatible with shadow VMCS so + * all VMREADs from L2 should go to L1. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return nested_vmx_failInvalid(vcpu); + + offset = evmcs_field_offset(field, NULL); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* Read the field, zero-extended to a u64 value */ + value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + } /* * Now copy part of this value to register or memory, as requested. From 6a0c61703e3a5d67845a4b275e1d9d7bc1b5aad7 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Thu, 27 Jan 2022 14:54:49 +0800 Subject: [PATCH 0767/1295] KVM: eventfd: Fix false positive RCU usage warning Fix the following false positive warning: ============================= WARNING: suspicious RCU usage 5.16.0-rc4+ #57 Not tainted ----------------------------- arch/x86/kvm/../../../virt/kvm/eventfd.c:484 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by fc_vcpu 0/330: #0: ffff8884835fc0b0 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x88/0x6f0 [kvm] #1: ffffc90004c0bb68 (&kvm->srcu){....}-{0:0}, at: vcpu_enter_guest+0x600/0x1860 [kvm] #2: ffffc90004c0c1d0 (&kvm->irq_srcu){....}-{0:0}, at: kvm_notify_acked_irq+0x36/0x180 [kvm] stack backtrace: CPU: 26 PID: 330 Comm: fc_vcpu 0 Not tainted 5.16.0-rc4+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x44/0x57 kvm_notify_acked_gsi+0x6b/0x70 [kvm] kvm_notify_acked_irq+0x8d/0x180 [kvm] kvm_ioapic_update_eoi+0x92/0x240 [kvm] kvm_apic_set_eoi_accelerated+0x2a/0xe0 [kvm] handle_apic_eoi_induced+0x3d/0x60 [kvm_intel] vmx_handle_exit+0x19c/0x6a0 [kvm_intel] vcpu_enter_guest+0x66e/0x1860 [kvm] kvm_arch_vcpu_ioctl_run+0x438/0x7f0 [kvm] kvm_vcpu_ioctl+0x38a/0x6f0 [kvm] __x64_sys_ioctl+0x89/0xc0 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Since kvm_unregister_irq_ack_notifier() does synchronize_srcu(&kvm->irq_srcu), kvm->irq_ack_notifier_list is protected by kvm->irq_srcu. In fact, kvm->irq_srcu SRCU read lock is held in kvm_notify_acked_irq(), making it a false positive warning. So use hlist_for_each_entry_srcu() instead of hlist_for_each_entry_rcu(). Reviewed-by: Sean Christopherson Signed-off-by: Hou Wenlong Message-Id: Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 2ad013b8bde9..59b1dd4a549e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -463,8 +463,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) { srcu_read_unlock(&kvm->irq_srcu, idx); return true; @@ -480,8 +480,8 @@ void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) + hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, + link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) kian->irq_acked(kian); } From fb25621da5702c104ce0a48de5b174ced09e5b4e Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 27 Jan 2022 13:13:34 +0000 Subject: [PATCH 0768/1295] ASoC: fsl: Add missing error handling in pcm030_fabric_probe Add the missing platform_device_put() and platform_device_del() before return from pcm030_fabric_probe in the error handling case. Fixes: c912fa913446 ("ASoC: fsl: register the wm9712-codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220127131336.30214-1-linmq006@gmail.com Signed-off-by: Mark Brown --- sound/soc/fsl/pcm030-audio-fabric.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c index af3c3b90c0ac..83b4a22bf15a 100644 --- a/sound/soc/fsl/pcm030-audio-fabric.c +++ b/sound/soc/fsl/pcm030-audio-fabric.c @@ -93,16 +93,21 @@ static int pcm030_fabric_probe(struct platform_device *op) dev_err(&op->dev, "platform_device_alloc() failed\n"); ret = platform_device_add(pdata->codec_device); - if (ret) + if (ret) { dev_err(&op->dev, "platform_device_add() failed: %d\n", ret); + platform_device_put(pdata->codec_device); + } ret = snd_soc_register_card(card); - if (ret) + if (ret) { dev_err(&op->dev, "snd_soc_register_card() failed: %d\n", ret); + platform_device_del(pdata->codec_device); + platform_device_put(pdata->codec_device); + } platform_set_drvdata(op, pdata); - return ret; + } static int pcm030_fabric_remove(struct platform_device *op) From 2cbd27267ffe020af1442b95ec57f59a157ba85c Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Thu, 27 Jan 2022 13:53:59 -0500 Subject: [PATCH 0769/1295] spi: bcm-qspi: check for valid cs before applying chip select Apply only valid chip select value. This change fixes case where chip select is set to initial value of '-1' during probe and PM supend and subsequent resume can try to use the value with undefined behaviour. Also in case where gpio based chip select, the check in bcm_qspi_chip_select() shall prevent undefined behaviour on resume. Fixes: fa236a7ef240 ("spi: bcm-qspi: Add Broadcom MSPI driver") Signed-off-by: Kamal Dasu Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220127185359.27322-1-kdasu.kdev@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-bcm-qspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index c9a769b8594b..86c76211b3d3 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -585,7 +585,7 @@ static void bcm_qspi_chip_select(struct bcm_qspi *qspi, int cs) u32 rd = 0; u32 wr = 0; - if (qspi->base[CHIP_SELECT]) { + if (cs >= 0 && qspi->base[CHIP_SELECT]) { rd = bcm_qspi_read(qspi, CHIP_SELECT, 0); wr = (rd & ~0xff) | (1 << cs); if (rd == wr) From 60b1e97140a487608b7cbde774b3cff1b5a99c00 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 26 Jan 2022 17:13:26 -0600 Subject: [PATCH 0770/1295] spi: dt-bindings: Fix 'reg' child node schema The schema for SPI child nodes' 'reg' property is not complete. 'reg' is a matrix of cells. The schema needs to define both the number of 'reg' entries and constraints on each entry. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220126231326.1636199-1-robh@kernel.org Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/spi-peripheral-props.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml b/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml index 5dd209206e88..3ec2d7b83775 100644 --- a/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml +++ b/Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml @@ -23,8 +23,9 @@ properties: minItems: 1 maxItems: 256 items: - minimum: 0 - maximum: 256 + items: + - minimum: 0 + maximum: 256 description: Chip select used by the device. From ab451ea952fe9d7afefae55ddb28943a148247fe Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 26 Jan 2022 13:13:38 -0800 Subject: [PATCH 0771/1295] nfsd: nfsd4_setclientid_confirm mistakenly expires confirmed client. From RFC 7530 Section 16.34.5: o The server has not recorded an unconfirmed { v, x, c, *, * } and has recorded a confirmed { v, x, c, *, s }. If the principals of the record and of SETCLIENTID_CONFIRM do not match, the server returns NFS4ERR_CLID_INUSE without removing any relevant leased client state, and without changing recorded callback and callback_ident values for client { x }. The current code intends to do what the spec describes above but it forgot to set 'old' to NULL resulting to the confirmed client to be expired. Fixes: 2b63482185e6 ("nfsd: fix clid_inuse on mount with security change") Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Bruce Fields --- fs/nfsd/nfs4state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 72900b89cf84..32063733443d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4130,8 +4130,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, - &old->cl_cred)) + &old->cl_cred)) { + old = NULL; goto out; + } status = mark_client_expired_locked(old); if (status) { old = NULL; From 928d6fe996f69330ded6b887baf4534c5fac7988 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Thu, 27 Jan 2022 21:17:14 +0900 Subject: [PATCH 0772/1295] net: stmmac: dwmac-visconti: No change to ETHER_CLOCK_SEL for unexpected speed request. Variable clk_sel_val is not initialized in the default case of the first switch statement. In that case, the function should return immediately without any changes to the hardware. Reported-by: kernel test robot Reported-by: Dan Carpenter Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") Signed-off-by: Yuji Ishikawa Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index dde5b772a5af..c3f10a92b62b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -49,13 +49,15 @@ struct visconti_eth { void __iomem *reg; u32 phy_intf_sel; struct clk *phy_ref_clk; + struct device *dev; spinlock_t lock; /* lock to protect register update */ }; static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) { struct visconti_eth *dwmac = priv; - unsigned int val, clk_sel_val; + struct net_device *netdev = dev_get_drvdata(dwmac->dev); + unsigned int val, clk_sel_val = 0; unsigned long flags; spin_lock_irqsave(&dwmac->lock, flags); @@ -85,7 +87,9 @@ static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) break; default: /* No bit control */ - break; + netdev_err(netdev, "Unsupported speed request (%d)", speed); + spin_unlock_irqrestore(&dwmac->lock, flags); + return; } writel(val, dwmac->reg + MAC_CTRL_REG); @@ -229,6 +233,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; + dwmac->dev = &pdev->dev; plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; From 500c77eed0feabddd5b3afb48e32c204614a8eab Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Thu, 27 Jan 2022 20:46:02 +0100 Subject: [PATCH 0773/1295] pinctrl: zynqmp: Revert "Unify pin naming" This reverts commit 54784ff24971ed5bd3f1056edce998148709d0a7. This patch changes the pin names from "MIO%d" to "MIO-%d", but all dts in arch/arm64/boot/dts/xilinx still use the old name. As a result my ZCU104 has no output on serial terminal and is not reachable over network. Signed-off-by: Gerhard Engleder Signed-off-by: Andy Shevchenko --- drivers/pinctrl/pinctrl-zynqmp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c index 42da6bd399ee..e14012209992 100644 --- a/drivers/pinctrl/pinctrl-zynqmp.c +++ b/drivers/pinctrl/pinctrl-zynqmp.c @@ -809,7 +809,6 @@ static int zynqmp_pinctrl_prepare_pin_desc(struct device *dev, unsigned int *npins) { struct pinctrl_pin_desc *pins, *pin; - char **pin_names; int ret; int i; @@ -821,14 +820,13 @@ static int zynqmp_pinctrl_prepare_pin_desc(struct device *dev, if (!pins) return -ENOMEM; - pin_names = devm_kasprintf_strarray(dev, ZYNQMP_PIN_PREFIX, *npins); - if (IS_ERR(pin_names)) - return PTR_ERR(pin_names); - for (i = 0; i < *npins; i++) { pin = &pins[i]; pin->number = i; - pin->name = pin_names[i]; + pin->name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", + ZYNQMP_PIN_PREFIX, i); + if (!pin->name) + return -ENOMEM; } *zynqmp_pins = pins; From 4e0f718daf97d47cf7dec122da1be970f145c809 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:15 +0800 Subject: [PATCH 0774/1295] ax25: improve the incomplete fix to avoid UAF and NPD bugs The previous commit 1ade48d0c27d ("ax25: NPD bug when detaching AX25 device") introduce lock_sock() into ax25_kill_by_device to prevent NPD bug. But the concurrency NPD or UAF bug will occur, when lock_sock() or release_sock() dereferences the ax25_cb->sock. The NULL pointer dereference bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() | ax25_cb_del() ... | ... | ax25->sk=NULL; lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is set to null before dereference site (1) or (2). Therefore, this patch extracts the ax25_cb->sock in advance, and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() and ensure the value of sock is not null before dereference sites. The concurrency UAF bug can be shown as below: ax25_kill_by_device() | ax25_release() | ax25_destroy_socket() ... | ... | sock_put(sk); //FREE lock_sock(s->sk); //(1) | s->ax25_dev = NULL; | ... release_sock(s->sk); //(2) | ... | The root cause is that the sock is released before dereference site (1) or (2). Therefore, this patch uses sock_hold() to increase the refcount of sock and uses ax25_list_lock to protect it, which can synchronize with ax25_cb_del() in ax25_destroy_socket() and ensure the sock wil not be released before dereference sites. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 02f43f3e2c56..44a8730c26ac 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -77,6 +77,7 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; + struct sock *sk; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; @@ -85,13 +86,15 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { + sk = s->sk; + sock_hold(sk); spin_unlock_bh(&ax25_list_lock); - lock_sock(s->sk); + lock_sock(sk); s->ax25_dev = NULL; - release_sock(s->sk); + release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); - + sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart From d01ffb9eee4af165d83b08dd73ebdf9fe94a519b Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:16 +0800 Subject: [PATCH 0775/1295] ax25: add refcount in ax25_dev to avoid UAF bugs If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e49589197..50b417df6221 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -239,6 +239,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -293,6 +294,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 44a8730c26ac..32f61978ff29 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -91,6 +91,7 @@ again: spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); @@ -439,6 +440,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 256fadb94df3..770b787fb7bb 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -56,6 +57,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold_track(dev, &ax25_dev->dev_tracker, GFP_ATOMIC); @@ -83,6 +85,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -112,20 +115,22 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -133,6 +138,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -149,6 +155,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break; case SIOCAX25DELFWD: @@ -161,6 +168,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) return -EINVAL; } + ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index d0b2e094bd55..1e32693833e5 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -116,6 +116,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -172,6 +173,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return 0; @@ -214,6 +216,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; } From 8c83d39cc730378bbac64d67a551897b203a606e Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:33 -0500 Subject: [PATCH 0776/1295] IB/hfi1: Fix panic with larger ipoib send_queue_size When the ipoib send_queue_size is increased from the default the following panic happens: RIP: 0010:hfi1_ipoib_drain_tx_ring+0x45/0xf0 [hfi1] Code: 31 e4 eb 0f 8b 85 c8 02 00 00 41 83 c4 01 44 39 e0 76 60 8b 8d cc 02 00 00 44 89 e3 be 01 00 00 00 d3 e3 48 03 9d c0 02 00 00 83 18 01 00 00 00 00 00 00 48 8b bb 30 01 00 00 e8 25 af a7 e0 RSP: 0018:ffffc9000798f4a0 EFLAGS: 00010286 RAX: 0000000000008000 RBX: ffffc9000aa0f000 RCX: 000000000000000f RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: ffff88810ff08000 R08: ffff88889476d900 R09: 0000000000000101 R10: 0000000000000000 R11: ffffc90006590ff8 R12: 0000000000000200 R13: ffffc9000798fba8 R14: 0000000000000000 R15: 0000000000000001 FS: 00007fd0f79cc3c0(0000) GS:ffff88885fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000aa0f118 CR3: 0000000889c84001 CR4: 00000000001706e0 Call Trace: hfi1_ipoib_napi_tx_disable+0x45/0x60 [hfi1] hfi1_ipoib_dev_stop+0x18/0x80 [hfi1] ipoib_ib_dev_stop+0x1d/0x40 [ib_ipoib] ipoib_stop+0x48/0xc0 [ib_ipoib] __dev_close_many+0x9e/0x110 __dev_change_flags+0xd9/0x210 dev_change_flags+0x21/0x60 do_setlink+0x31c/0x10f0 ? __nla_validate_parse+0x12d/0x1a0 ? __nla_parse+0x21/0x30 ? inet6_validate_link_af+0x5e/0xf0 ? cpumask_next+0x1f/0x20 ? __snmp6_fill_stats64.isra.53+0xbb/0x140 ? __nla_validate_parse+0x47/0x1a0 __rtnl_newlink+0x530/0x910 ? pskb_expand_head+0x73/0x300 ? __kmalloc_node_track_caller+0x109/0x280 ? __nla_put+0xc/0x20 ? cpumask_next_and+0x20/0x30 ? update_sd_lb_stats.constprop.144+0xd3/0x820 ? _raw_spin_unlock_irqrestore+0x25/0x37 ? __wake_up_common_lock+0x87/0xc0 ? kmem_cache_alloc_trace+0x3d/0x3d0 rtnl_newlink+0x43/0x60 The issue happens when the shift that should have been a function of the txq item size mistakenly used the ring size. Fix by using the item size. Cc: stable@vger.kernel.org Fixes: d47dfc2b00e6 ("IB/hfi1: Remove cache and embed txreq in ring") Link: https://lore.kernel.org/r/1642287756-182313-2-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index f4010890309f..bf62956c8667 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -731,7 +731,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) goto free_txqs; txq->tx_ring.max_items = tx_ring_size; - txq->tx_ring.shift = ilog2(tx_ring_size); + txq->tx_ring.shift = ilog2(tx_item_size); txq->tx_ring.avail = hfi1_ipoib_ring_hwat(txq); netif_tx_napi_add(dev, &txq->napi, From 1f84a9450d75e08af70d9e2f2d5e1c0ac0c881d2 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Fri, 28 Jan 2022 18:47:14 +0800 Subject: [PATCH 0777/1295] gve: fix the wrong AdminQ buffer queue index check The 'tail' and 'head' are 'unsigned int' type free-running count, when 'head' is overflow, the 'int i (= tail) < u32 head' will be false: Only '- loop 0: idx = 63' result is shown, so it needs to use 'int' type to compare, it can handle the overflow correctly. typedef uint32_t u32; int main() { u32 tail, head; int stail, shead; int i, loop; tail = 0xffffffff; head = 0x00000000; for (i = tail, loop = 0; i < head; i++) { unsigned int idx = i & 63; printf("+ loop %d: idx = %u\n", loop++, idx); } stail = tail; shead = head; for (i = stail, loop = 0; i < shead; i++) { unsigned int idx = i & 63; printf("- loop %d: idx = %u\n", loop++, idx); } return 0; } Fixes: 5cdad90de62c ("gve: Batch AQ commands for creating and destroying queues.") Signed-off-by: Haiyue Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 2ad7f57f7e5b..f7621ab672b9 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -301,7 +301,7 @@ static int gve_adminq_parse_err(struct gve_priv *priv, u32 status) */ static int gve_adminq_kick_and_wait(struct gve_priv *priv) { - u32 tail, head; + int tail, head; int i; tail = ioread32be(&priv->reg_bar0->adminq_event_counter); From b1151b74ff68cc83c2a8e1a618efe7d056e4f237 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:34 -0500 Subject: [PATCH 0778/1295] IB/hfi1: Fix alloc failure with larger txqueuelen The following allocation with large txqueuelen will result in the following warning: Call Trace: __alloc_pages_nodemask+0x283/0x2c0 kmalloc_large_node+0x3c/0xa0 __kmalloc_node+0x22a/0x2f0 hfi1_ipoib_txreq_init+0x19f/0x330 [hfi1] hfi1_ipoib_setup_rn+0xd3/0x1a0 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] ib_register_client+0x147/0x190 [ib_core] ipoib_init_module+0xdd/0x132 [ib_ipoib] do_one_initcall+0x46/0x1c3 do_init_module+0x5a/0x220 load_module+0x14c5/0x17f0 __do_sys_init_module+0x13b/0x180 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca For ipoib, the txqueuelen is modified with the module parameter send_queue_size. Fix by changing to use kv versions of the same allocator to handle the large allocations. The allocation embeds a hdr struct that is dma mapped. Change that struct to a pointer to a kzalloced struct. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-3-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib.h | 2 +- drivers/infiniband/hw/hfi1/ipoib_tx.c | 36 +++++++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index 909122934246..aec60d4888eb 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -55,7 +55,7 @@ union hfi1_ipoib_flow { */ struct ipoib_txreq { struct sdma_txreq txreq; - struct hfi1_sdma_header sdma_hdr; + struct hfi1_sdma_header *sdma_hdr; int sdma_status; int complete; struct hfi1_ipoib_dev_priv *priv; diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index bf62956c8667..d6bbdb8fcb50 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -122,7 +122,7 @@ static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) dd_dev_warn(priv->dd, "%s: Status = 0x%x pbc 0x%llx txq = %d sde = %d\n", __func__, tx->sdma_status, - le64_to_cpu(tx->sdma_hdr.pbc), tx->txq->q_idx, + le64_to_cpu(tx->sdma_hdr->pbc), tx->txq->q_idx, tx->txq->sde->this_idx); } @@ -231,7 +231,7 @@ static int hfi1_ipoib_build_tx_desc(struct ipoib_txreq *tx, { struct hfi1_devdata *dd = txp->dd; struct sdma_txreq *txreq = &tx->txreq; - struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; u16 pkt_bytes = sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2) + tx->skb->len; int ret; @@ -256,7 +256,7 @@ static void hfi1_ipoib_build_ib_tx_headers(struct ipoib_txreq *tx, struct ipoib_txparms *txp) { struct hfi1_ipoib_dev_priv *priv = tx->txq->priv; - struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; struct sk_buff *skb = tx->skb; struct hfi1_pportdata *ppd = ppd_from_ibp(txp->ibp); struct rdma_ah_attr *ah_attr = txp->ah_attr; @@ -483,7 +483,7 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, if (likely(!ret)) { tx_ok: trace_sdma_output_ibhdr(txq->priv->dd, - &tx->sdma_hdr.hdr, + &tx->sdma_hdr->hdr, ib_is_sc5(txp->flow.sc5)); hfi1_ipoib_check_queue_depth(txq); return NETDEV_TX_OK; @@ -547,7 +547,7 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev, hfi1_ipoib_check_queue_depth(txq); trace_sdma_output_ibhdr(txq->priv->dd, - &tx->sdma_hdr.hdr, + &tx->sdma_hdr->hdr, ib_is_sc5(txp->flow.sc5)); if (!netdev_xmit_more()) @@ -683,7 +683,8 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) { struct net_device *dev = priv->netdev; u32 tx_ring_size, tx_item_size; - int i; + struct hfi1_ipoib_circ_buf *tx_ring; + int i, j; /* * Ring holds 1 less than tx_ring_size @@ -701,7 +702,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) for (i = 0; i < dev->num_tx_queues; i++) { struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct ipoib_txreq *tx; + tx_ring = &txq->tx_ring; iowait_init(&txq->wait, 0, hfi1_ipoib_flush_txq, @@ -725,14 +728,19 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) priv->dd->node); txq->tx_ring.items = - kcalloc_node(tx_ring_size, tx_item_size, - GFP_KERNEL, priv->dd->node); + kvzalloc_node(array_size(tx_ring_size, tx_item_size), + GFP_KERNEL, priv->dd->node); if (!txq->tx_ring.items) goto free_txqs; txq->tx_ring.max_items = tx_ring_size; txq->tx_ring.shift = ilog2(tx_item_size); txq->tx_ring.avail = hfi1_ipoib_ring_hwat(txq); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) + hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr = + kzalloc_node(sizeof(*tx->sdma_hdr), + GFP_KERNEL, priv->dd->node); netif_tx_napi_add(dev, &txq->napi, hfi1_ipoib_poll_tx_ring, @@ -746,7 +754,10 @@ free_txqs: struct hfi1_ipoib_txq *txq = &priv->txqs[i]; netif_napi_del(&txq->napi); - kfree(txq->tx_ring.items); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); } kfree(priv->txqs); @@ -780,17 +791,20 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv) { - int i; + int i, j; for (i = 0; i < priv->netdev->num_tx_queues; i++) { struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; iowait_cancel_work(&txq->wait); iowait_sdma_drain(&txq->wait); hfi1_ipoib_drain_tx_list(txq); netif_napi_del(&txq->napi); hfi1_ipoib_drain_tx_ring(txq); - kfree(txq->tx_ring.items); + for (j = 0; j < tx_ring->max_items; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); } kfree(priv->txqs); From 5f8f55b92edd621f056bdf09e572092849fabd83 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:35 -0500 Subject: [PATCH 0779/1295] IB/hfi1: Fix AIP early init panic An early failure in hfi1_ipoib_setup_rn() can lead to the following panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI Workqueue: events work_for_cpu_fn RIP: 0010:try_to_grab_pending+0x2b/0x140 Code: 1f 44 00 00 41 55 41 54 55 48 89 d5 53 48 89 fb 9c 58 0f 1f 44 00 00 48 89 c2 fa 66 0f 1f 44 00 00 48 89 55 00 40 84 f6 75 77 48 0f ba 2b 00 72 09 31 c0 5b 5d 41 5c 41 5d c3 48 89 df e8 6c RSP: 0018:ffffb6b3cf7cfa48 EFLAGS: 00010046 RAX: 0000000000000246 RBX: 00000000000001b0 RCX: 0000000000000000 RDX: 0000000000000246 RSI: 0000000000000000 RDI: 00000000000001b0 RBP: ffffb6b3cf7cfa70 R08: 0000000000000f09 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffffb6b3cf7cfa90 R14: ffffffff9b2fbfc0 R15: ffff8a4fdf244690 FS: 0000000000000000(0000) GS:ffff8a527f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001b0 CR3: 00000017e2410003 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __cancel_work_timer+0x42/0x190 ? dev_printk_emit+0x4e/0x70 iowait_cancel_work+0x15/0x30 [hfi1] hfi1_ipoib_txreq_deinit+0x5a/0x220 [hfi1] ? dev_err+0x6c/0x90 hfi1_ipoib_netdev_dtor+0x15/0x30 [hfi1] hfi1_ipoib_setup_rn+0x10e/0x150 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ? hfi1_ipoib_free_rdma_netdev+0x20/0x20 [hfi1] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] enable_device_and_get+0xdc/0x1d0 [ib_core] ib_register_device+0x572/0x6b0 [ib_core] rvt_register_device+0x11b/0x220 [rdmavt] hfi1_register_ib_device+0x6b4/0x770 [hfi1] do_init_one.isra.20+0x3e3/0x680 [hfi1] local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x1f/0x40 The panic happens in hfi1_ipoib_txreq_deinit() because there is a NULL deref when hfi1_ipoib_netdev_dtor() is called in this error case. hfi1_ipoib_txreq_init() and hfi1_ipoib_rxq_init() are self unwinding so fix by adjusting the error paths accordingly. Other changes: - hfi1_ipoib_free_rdma_netdev() is deleted including the free_netdev() since the netdev core code deletes calls free_netdev() - The switch to the accelerated entrances is moved to the success path. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-4-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_main.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index e1a2b02bbd91..8306ed5c1b80 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -168,12 +168,6 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) free_percpu(dev->tstats); } -static void hfi1_ipoib_free_rdma_netdev(struct net_device *dev) -{ - hfi1_ipoib_netdev_dtor(dev); - free_netdev(dev); -} - static void hfi1_ipoib_set_id(struct net_device *dev, int id) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); @@ -211,24 +205,23 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, priv->port_num = port_num; priv->netdev_ops = netdev->netdev_ops; - netdev->netdev_ops = &hfi1_ipoib_netdev_ops; - ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey); rc = hfi1_ipoib_txreq_init(priv); if (rc) { dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc); - hfi1_ipoib_free_rdma_netdev(netdev); return rc; } rc = hfi1_ipoib_rxq_init(netdev); if (rc) { dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc); - hfi1_ipoib_free_rdma_netdev(netdev); + hfi1_ipoib_txreq_deinit(priv); return rc; } + netdev->netdev_ops = &hfi1_ipoib_netdev_ops; + netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; From e5cce44aff3be9ad2cd52f63f35edbd706181d50 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 15 Jan 2022 18:02:36 -0500 Subject: [PATCH 0780/1295] IB/hfi1: Fix tstats alloc and dealloc The tstats allocation is done in the accelerated ndo_init function but the allocation is not tested to succeed. The deallocation is not done in the accelerated ndo_uninit function. Resolve issues by testing for an allocation failure and adding the free_percpu in the uninit function. Fixes: aa0616a9bd52 ("IB/hfi1: switch to core handling of rx/tx byte/packet counters") Link: https://lore.kernel.org/r/1642287756-182313-5-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 8306ed5c1b80..5d814afdf7f3 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -22,26 +22,35 @@ static int hfi1_ipoib_dev_init(struct net_device *dev) int ret; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; ret = priv->netdev_ops->ndo_init(dev); if (ret) - return ret; + goto out_ret; ret = hfi1_netdev_add_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr), dev); if (ret < 0) { priv->netdev_ops->ndo_uninit(dev); - return ret; + goto out_ret; } return 0; +out_ret: + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + free_percpu(dev->tstats); + dev->tstats = NULL; + hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); priv->netdev_ops->ndo_uninit(dev); @@ -166,6 +175,7 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) hfi1_ipoib_rxq_deinit(priv->netdev); free_percpu(dev->tstats); + dev->tstats = NULL; } static void hfi1_ipoib_set_id(struct net_device *dev, int id) From 6449520391dfc3d2cef134f11a91251a054ff7d0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 28 Jan 2022 22:15:50 +0800 Subject: [PATCH 0781/1295] net: stmmac: properly handle with runtime pm in stmmac_dvr_remove() There are two issues with runtime pm handling in stmmac_dvr_remove(): 1. the mac is runtime suspended before stopping dma and rx/tx. We need to ensure the device is properly resumed back. 2. the stmmaceth clk enable/disable isn't balanced in both exit and error handling code path. Take the exit code path for example, when we unbind the driver or rmmod the driver module, the mac is runtime suspended as said above, so the stmmaceth clk is disabled, but stmmac_dvr_remove() stmmac_remove_config_dt() clk_disable_unprepare() CCF will complain this time. The error handling code path suffers from the similar situtaion. Here are kernel warnings in error handling code path on Allwinner D1 platform: [ 1.604695] ------------[ cut here ]------------ [ 1.609328] bus-emac already disabled [ 1.613015] WARNING: CPU: 0 PID: 38 at drivers/clk/clk.c:952 clk_core_disable+0xcc/0xec [ 1.621039] CPU: 0 PID: 38 Comm: kworker/u2:1 Not tainted 5.14.0-rc4#1 [ 1.627653] Hardware name: Allwinner D1 NeZha (DT) [ 1.632443] Workqueue: events_unbound deferred_probe_work_func [ 1.638286] epc : clk_core_disable+0xcc/0xec [ 1.642561] ra : clk_core_disable+0xcc/0xec [ 1.646835] epc : ffffffff8023c2ec ra : ffffffff8023c2ec sp : ffffffd00411bb10 [ 1.654054] gp : ffffffff80ec9988 tp : ffffffe00143a800 t0 : ffffffff80ed6a6f [ 1.661272] t1 : ffffffff80ed6a60 t2 : 0000000000000000 s0 : ffffffe001509e00 [ 1.668489] s1 : 0000000000000001 a0 : 0000000000000019 a1 : ffffffff80e80bd8 [ 1.675707] a2 : 00000000ffffefff a3 : 00000000000000f4 a4 : 0000000000000002 [ 1.682924] a5 : 0000000000000001 a6 : 0000000000000030 a7 : 00000000028f5c29 [ 1.690141] s2 : 0000000000000800 s3 : ffffffe001375000 s4 : ffffffe01fdf7a80 [ 1.697358] s5 : ffffffe001375010 s6 : ffffffff8001fc10 s7 : ffffffffffffffff [ 1.704577] s8 : 0000000000000001 s9 : ffffffff80ecb248 s10: ffffffe001b80000 [ 1.711794] s11: ffffffe001b80760 t3 : 0000000000000062 t4 : ffffffffffffffff [ 1.719012] t5 : ffffffff80e0f6d8 t6 : ffffffd00411b8f0 [ 1.724321] status: 8000000201800100 badaddr: 0000000000000000 cause: 0000000000000003 [ 1.732233] [] clk_core_disable+0xcc/0xec [ 1.737810] [] clk_disable+0x38/0x78 [ 1.742956] [] worker_thread+0x1a8/0x4d8 [ 1.748451] [] stmmac_remove_config_dt+0x1c/0x4c [ 1.754646] [] sun8i_dwmac_probe+0x378/0x82c [ 1.760484] [] worker_thread+0x1a8/0x4d8 [ 1.765975] [] platform_probe+0x64/0xf0 [ 1.771382] [] really_probe.part.0+0x8c/0x30c [ 1.777305] [] __driver_probe_device+0xa0/0x148 [ 1.783402] [] driver_probe_device+0x38/0x138 [ 1.789324] [] __device_attach_driver+0xd0/0x170 [ 1.795508] [] __driver_attach_async_helper+0xbc/0xc0 [ 1.802125] [] bus_for_each_drv+0x68/0xb4 [ 1.807701] [] __device_attach+0xd8/0x184 [ 1.813277] [] bus_probe_device+0x98/0xbc [ 1.818852] [] deferred_probe_work_func+0x90/0xd4 [ 1.825122] [] process_one_work+0x1e4/0x390 [ 1.830872] [] worker_thread+0x31c/0x4d8 [ 1.836362] [] kthreadd+0x94/0x188 [ 1.841335] [] kthreadd+0x94/0x188 [ 1.846304] [] process_one_work+0x38c/0x390 [ 1.852054] [] kthread+0x124/0x160 [ 1.857021] [] set_kthread_struct+0x5c/0x60 [ 1.862770] [] ret_from_syscall_rejected+0x8/0xc [ 1.868956] ---[ end trace 8d5c6046255f84a0 ]--- [ 1.873675] ------------[ cut here ]------------ [ 1.878366] bus-emac already unprepared [ 1.882378] WARNING: CPU: 0 PID: 38 at drivers/clk/clk.c:810 clk_core_unprepare+0xe4/0x168 [ 1.890673] CPU: 0 PID: 38 Comm: kworker/u2:1 Tainted: G W 5.14.0-rc4 #1 [ 1.898674] Hardware name: Allwinner D1 NeZha (DT) [ 1.903464] Workqueue: events_unbound deferred_probe_work_func [ 1.909305] epc : clk_core_unprepare+0xe4/0x168 [ 1.913840] ra : clk_core_unprepare+0xe4/0x168 [ 1.918375] epc : ffffffff8023d6cc ra : ffffffff8023d6cc sp : ffffffd00411bb10 [ 1.925593] gp : ffffffff80ec9988 tp : ffffffe00143a800 t0 : 0000000000000002 [ 1.932811] t1 : ffffffe01f743be0 t2 : 0000000000000040 s0 : ffffffe001509e00 [ 1.940029] s1 : 0000000000000001 a0 : 000000000000001b a1 : ffffffe00143a800 [ 1.947246] a2 : 0000000000000000 a3 : 00000000000000f4 a4 : 0000000000000001 [ 1.954463] a5 : 0000000000000000 a6 : 0000000005fce2a5 a7 : 0000000000000001 [ 1.961680] s2 : 0000000000000800 s3 : ffffffff80afeb90 s4 : ffffffe01fdf7a80 [ 1.968898] s5 : ffffffe001375010 s6 : ffffffff8001fc10 s7 : ffffffffffffffff [ 1.976115] s8 : 0000000000000001 s9 : ffffffff80ecb248 s10: ffffffe001b80000 [ 1.983333] s11: ffffffe001b80760 t3 : ffffffff80b39120 t4 : 0000000000000001 [ 1.990550] t5 : 0000000000000000 t6 : ffffffe001600002 [ 1.995859] status: 8000000201800120 badaddr: 0000000000000000 cause: 0000000000000003 [ 2.003771] [] clk_core_unprepare+0xe4/0x168 [ 2.009609] [] clk_unprepare+0x24/0x3c [ 2.014929] [] stmmac_remove_config_dt+0x24/0x4c [ 2.021125] [] sun8i_dwmac_probe+0x378/0x82c [ 2.026965] [] worker_thread+0x1a8/0x4d8 [ 2.032463] [] platform_probe+0x64/0xf0 [ 2.037871] [] really_probe.part.0+0x8c/0x30c [ 2.043795] [] __driver_probe_device+0xa0/0x148 [ 2.049892] [] driver_probe_device+0x38/0x138 [ 2.055815] [] __device_attach_driver+0xd0/0x170 [ 2.061999] [] __driver_attach_async_helper+0xbc/0xc0 [ 2.068616] [] bus_for_each_drv+0x68/0xb4 [ 2.074193] [] __device_attach+0xd8/0x184 [ 2.079769] [] bus_probe_device+0x98/0xbc [ 2.085345] [] deferred_probe_work_func+0x90/0xd4 [ 2.091616] [] process_one_work+0x1e4/0x390 [ 2.097367] [] worker_thread+0x31c/0x4d8 [ 2.102858] [] kthreadd+0x94/0x188 [ 2.107830] [] kthreadd+0x94/0x188 [ 2.112800] [] process_one_work+0x38c/0x390 [ 2.118551] [] kthread+0x124/0x160 [ 2.123520] [] set_kthread_struct+0x5c/0x60 [ 2.129268] [] ret_from_syscall_rejected+0x8/0xc [ 2.135455] ---[ end trace 8d5c6046255f84a1 ]--- Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 639a753266e6..bde76ea2deec 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7252,6 +7252,10 @@ int stmmac_dvr_remove(struct device *dev) netdev_info(priv->dev, "%s: removing driver", __func__); + pm_runtime_get_sync(dev); + pm_runtime_disable(dev); + pm_runtime_put_noidle(dev); + stmmac_stop_all_dma(priv); stmmac_mac_set(priv, priv->ioaddr, false); netif_carrier_off(ndev); @@ -7270,8 +7274,6 @@ int stmmac_dvr_remove(struct device *dev) if (priv->plat->stmmac_rst) reset_control_assert(priv->plat->stmmac_rst); reset_control_assert(priv->plat->stmmac_ahb_rst); - pm_runtime_put(dev); - pm_runtime_disable(dev); if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); From d9e410ebbed9d091b97bdf45b8a3792e2878dc48 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 18 Jan 2022 09:35:00 +0200 Subject: [PATCH 0782/1295] RDMA/cma: Use correct address when leaving multicast group In RoCE we should use cma_iboe_set_mgid() and not cma_set_mgid to generate the mgid, otherwise we will generate an IGMP for an incorrect address. Fixes: b5de0c60cc30 ("RDMA/cma: Fix use after free race in roce multicast join") Link: https://lore.kernel.org/r/913bc6783fd7a95fe71ad9454e01653ee6fb4a9a.1642491047.git.leonro@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 27a00ce2e101..c447526288f4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -67,8 +67,8 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; -static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, - union ib_gid *mgid); +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { @@ -1846,17 +1846,19 @@ static void destroy_mc(struct rdma_id_private *id_priv, if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (ndev) { + if (ndev && !send_only) { + enum ib_gid_type gid_type; union ib_gid mgid; - cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, - &mgid); - - if (!send_only) - cma_igmp_send(ndev, &mgid, false); - - dev_put(ndev); + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); } + dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } From 36e8169ec973359f671f9ec7213547059cae972e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 18 Jan 2022 09:35:01 +0200 Subject: [PATCH 0783/1295] RDMA/ucma: Protect mc during concurrent multicast leaves Partially revert the commit mentioned in the Fixes line to make sure that allocation and erasing multicast struct are locked. BUG: KASAN: use-after-free in ucma_cleanup_multicast drivers/infiniband/core/ucma.c:491 [inline] BUG: KASAN: use-after-free in ucma_destroy_private_ctx+0x914/0xb70 drivers/infiniband/core/ucma.c:579 Read of size 8 at addr ffff88801bb74b00 by task syz-executor.1/25529 CPU: 0 PID: 25529 Comm: syz-executor.1 Not tainted 5.16.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:450 ucma_cleanup_multicast drivers/infiniband/core/ucma.c:491 [inline] ucma_destroy_private_ctx+0x914/0xb70 drivers/infiniband/core/ucma.c:579 ucma_destroy_id+0x1e6/0x280 drivers/infiniband/core/ucma.c:614 ucma_write+0x25c/0x350 drivers/infiniband/core/ucma.c:1732 vfs_write+0x28e/0xae0 fs/read_write.c:588 ksys_write+0x1ee/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Currently the xarray search can touch a concurrently freeing mc as the xa_for_each() is not surrounded by any lock. Rather than hold the lock for a full scan hold it only for the effected items, which is usually an empty list. Fixes: 95fe51096b7a ("RDMA/ucma: Remove mc_list and rely on xarray") Link: https://lore.kernel.org/r/1cda5fabb1081e8d16e39a48d3a4f8160cea88b8.1642491047.git.leonro@nvidia.com Reported-by: syzbot+e3f96c43d19782dd14a7@syzkaller.appspotmail.com Suggested-by: Jason Gunthorpe Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2b72c4fa9550..9d6ac9dff39a 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -95,6 +95,7 @@ struct ucma_context { u64 uid; struct list_head list; + struct list_head mc_list; struct work_struct close_work; }; @@ -105,6 +106,7 @@ struct ucma_multicast { u64 uid; u8 join_state; + struct list_head list; struct sockaddr_storage addr; }; @@ -198,6 +200,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_WORK(&ctx->close_work, ucma_close_id); init_completion(&ctx->comp); + INIT_LIST_HEAD(&ctx->mc_list); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); ctx->file = file; @@ -484,19 +487,19 @@ err1: static void ucma_cleanup_multicast(struct ucma_context *ctx) { - struct ucma_multicast *mc; - unsigned long index; + struct ucma_multicast *mc, *tmp; - xa_for_each(&multicast_table, index, mc) { - if (mc->ctx != ctx) - continue; + xa_lock(&multicast_table); + list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { + list_del(&mc->list); /* * At this point mc->ctx->ref is 0 so the mc cannot leave the * lock on the reader and this is enough serialization */ - xa_erase(&multicast_table, index); + __xa_erase(&multicast_table, mc->id); kfree(mc); } + xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) @@ -1469,12 +1472,16 @@ static ssize_t ucma_process_join(struct ucma_file *file, mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); - if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) { ret = -ENOMEM; goto err_free_mc; } + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); @@ -1500,8 +1507,11 @@ err_leave_multicast: mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); err_xa_erase: - xa_erase(&multicast_table, mc->id); + xa_lock(&multicast_table); + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); err_free_mc: + xa_unlock(&multicast_table); kfree(mc); err_put_ctx: ucma_put_ctx(ctx); @@ -1569,15 +1579,17 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); - else - __xa_erase(&multicast_table, mc->id); - xa_unlock(&multicast_table); if (IS_ERR(mc)) { + xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); From a75badebfdc0b3823054bedf112edb54d6357c75 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Jan 2022 12:11:04 +0300 Subject: [PATCH 0784/1295] RDMA/siw: Fix refcounting leak in siw_create_qp() The atomic_inc() needs to be paired with an atomic_dec() on the error path. Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/20220118091104.GA11671@kili Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index a3dd2cb6d5c9..54ef367b074a 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -313,7 +313,8 @@ int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { siw_dbg(base_dev, "too many QP's\n"); - return -ENOMEM; + rv = -ENOMEM; + goto err_atomic; } if (attrs->qp_type != IB_QPT_RC) { siw_dbg(base_dev, "only RC QP's supported\n"); From 3c75c0ea5da749bd1efebd1387f2e5011b8c7d78 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 16:52:48 +0100 Subject: [PATCH 0785/1295] ASoC: soc-pcm: Fix DPCM lockdep warning due to nested stream locks The recent change for DPCM locking caused spurious lockdep warnings. Actually the warnings are false-positive, as those are triggered due to the nested stream locks for FE and BE. Since both locks belong to the same lock class, lockdep sees it as if a deadlock. For fixing this, we need to take PCM stream locks for BE with the nested lock primitives. Since currently snd_pcm_stream_lock*() helper assumes only the top-level single locking, a new helper function snd_pcm_stream_lock_irqsave_nested() is defined for a single-depth nested lock, which is now used in the BE DAI trigger that is always performed inside a FE stream lock. Fixes: b2ae80663008 ("ASoC: soc-pcm: serialize BE triggers") Reported-and-tested-by: Hans de Goede Reported-and-tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/73018f3c-9769-72ea-0325-b3f8e2381e30@redhat.com Link: https://lore.kernel.org/alsa-devel/9a0abddd-49e9-872d-2f00-a1697340f786@samsung.com Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20220119155249.26754-2-tiwai@suse.de Signed-off-by: Mark Brown --- include/sound/pcm.h | 15 +++++++++++++++ sound/core/pcm_native.c | 13 +++++++++++++ sound/soc/soc-pcm.c | 6 +++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 33451f8ff755..524220fe1af6 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -614,6 +614,7 @@ void snd_pcm_stream_unlock(struct snd_pcm_substream *substream); void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream); unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream); /** * snd_pcm_stream_lock_irqsave - Lock the PCM stream @@ -632,6 +633,20 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, unsigned long flags); +/** + * snd_pcm_stream_lock_irqsave_nested - Single-nested PCM stream locking + * @substream: PCM substream + * @flags: irq flags + * + * This locks the PCM stream like snd_pcm_stream_lock_irqsave() but with + * the single-depth lockdep subclass. + */ +#define snd_pcm_stream_lock_irqsave_nested(substream, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = _snd_pcm_stream_lock_irqsave_nested(substream); \ + } while (0) + /** * snd_pcm_group_for_each_entry - iterate over the linked substreams * @s: the iterator diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 621883e71194..a056b3ef3c84 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -172,6 +172,19 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream) +{ + unsigned long flags = 0; + if (substream->pcm->nonatomic) + mutex_lock_nested(&substream->self_group.mutex, + SINGLE_DEPTH_NESTING); + else + spin_lock_irqsave_nested(&substream->self_group.lock, flags, + SINGLE_DEPTH_NESTING); + return flags; +} +EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave_nested); + /** * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream * @substream: PCM substream diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 7abfc48b26ca..e8876e65c649 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -46,8 +46,8 @@ static inline void snd_soc_dpcm_stream_lock_irq(struct snd_soc_pcm_runtime *rtd, snd_pcm_stream_lock_irq(snd_soc_dpcm_get_substream(rtd, stream)); } -#define snd_soc_dpcm_stream_lock_irqsave(rtd, stream, flags) \ - snd_pcm_stream_lock_irqsave(snd_soc_dpcm_get_substream(rtd, stream), flags) +#define snd_soc_dpcm_stream_lock_irqsave_nested(rtd, stream, flags) \ + snd_pcm_stream_lock_irqsave_nested(snd_soc_dpcm_get_substream(rtd, stream), flags) static inline void snd_soc_dpcm_stream_unlock_irq(struct snd_soc_pcm_runtime *rtd, int stream) @@ -2094,7 +2094,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream, be = dpcm->be; be_substream = snd_soc_dpcm_get_substream(be, stream); - snd_soc_dpcm_stream_lock_irqsave(be, stream, flags); + snd_soc_dpcm_stream_lock_irqsave_nested(be, stream, flags); /* is this op for this BE ? */ if (!snd_soc_dpcm_be_can_update(fe, be, stream)) From 9f620684c1ef5a002b6622ecc7b5818e81252f48 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 16:52:49 +0100 Subject: [PATCH 0786/1295] ASoC: soc-pcm: Move debugfs removal out of spinlock The recent fix for DPCM locking also covered the loop in dpcm_be_disconnect() with the FE stream lock. This caused an unexpected side effect, thought: calling debugfs_remove_recursive() in the spinlock may lead to lockdep splats as the code there assumes the SOFTIRQ-safe context. For avoiding the problem, this patch changes the disconnection procedure to two phases: at first, the matching entries are removed from the linked list, then the resources are freed outside the lock. Fixes: b7898396f4bb ("ASoC: soc-pcm: Fix and cleanup DPCM locking") Reported-and-tested-by: Marek Szyprowski Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20220119155249.26754-3-tiwai@suse.de Signed-off-by: Mark Brown --- sound/soc/soc-pcm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index e8876e65c649..9a954680d492 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1268,6 +1268,7 @@ static void dpcm_be_reparent(struct snd_soc_pcm_runtime *fe, void dpcm_be_disconnect(struct snd_soc_pcm_runtime *fe, int stream) { struct snd_soc_dpcm *dpcm, *d; + LIST_HEAD(deleted_dpcms); snd_soc_dpcm_mutex_assert_held(fe); @@ -1287,13 +1288,18 @@ void dpcm_be_disconnect(struct snd_soc_pcm_runtime *fe, int stream) /* BEs still alive need new FE */ dpcm_be_reparent(fe, dpcm->be, stream); - dpcm_remove_debugfs_state(dpcm); - list_del(&dpcm->list_be); - list_del(&dpcm->list_fe); - kfree(dpcm); + list_move(&dpcm->list_fe, &deleted_dpcms); } snd_soc_dpcm_stream_unlock_irq(fe, stream); + + while (!list_empty(&deleted_dpcms)) { + dpcm = list_first_entry(&deleted_dpcms, struct snd_soc_dpcm, + list_fe); + list_del(&dpcm->list_fe); + dpcm_remove_debugfs_state(dpcm); + kfree(dpcm); + } } /* get BE for DAI widget and stream */ From 06feec6005c9d9500cd286ec440aabf8b2ddd94d Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 12 Jan 2022 22:50:39 +0300 Subject: [PATCH 0787/1295] ASoC: hdmi-codec: Fix OOB memory accesses Correct size of iec_status array by changing it to the size of status array of the struct snd_aes_iec958. This fixes out-of-bounds slab read accesses made by memcpy() of the hdmi-codec driver. This problem is reported by KASAN. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20220112195039.1329-1-digetx@gmail.com Signed-off-by: Mark Brown --- include/uapi/sound/asound.h | 4 +++- sound/soc/codecs/hdmi-codec.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index ff7e638221c5..228279ea0670 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -56,8 +56,10 @@ * * ****************************************************************************/ +#define AES_IEC958_STATUS_SIZE 24 + struct snd_aes_iec958 { - unsigned char status[24]; /* AES/IEC958 channel status bits */ + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ unsigned char subcode[147]; /* AES/IEC958 subcode bits */ unsigned char pad; /* nothing */ unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c index b61f980cabdc..b07607a9ecea 100644 --- a/sound/soc/codecs/hdmi-codec.c +++ b/sound/soc/codecs/hdmi-codec.c @@ -277,7 +277,7 @@ struct hdmi_codec_priv { bool busy; struct snd_soc_jack *jack; unsigned int jack_status; - u8 iec_status[5]; + u8 iec_status[AES_IEC958_STATUS_SIZE]; }; static const struct snd_soc_dapm_widget hdmi_widgets[] = { From 4045daf0fa87846a27f56329fddad2deeb5ca354 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 26 Jan 2022 12:03:25 +0200 Subject: [PATCH 0788/1295] ASoC: rt5682: Fix deadlock on resume On resume from suspend the following chain of events can happen: A rt5682_resume() -> mod_delayed_work() for jack_detect_work B DAPM sequence starts ( DAPM is locked now) A1. rt5682_jack_detect_handler() scheduled - Takes both jdet_mutex and calibrate_mutex - Calls in to rt5682_headset_detect() which tries to take DAPM lock, it starts to wait for it as B path took it already. B1. DAPM sequence reaches the "HP Amp", rt5682_hp_event() tries to take the jdet_mutex, but it is locked in A1, so it waits. Deadlock. To solve the deadlock, drop the jdet_mutex, use the jack_detect_work to do the jack removal handling, move the dapm lock up one level to protect the most of the rt5682_jack_detect_handler(), but not the jack reporting as it might trigger a DAPM sequence. The rt5682_headset_detect() can be changed to static as well. Fixes: 8deb34a90f063 ("ASoC: rt5682: fix the wrong jack type detected") Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20220126100325.16513-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682-i2c.c | 15 ++++----------- sound/soc/codecs/rt5682.c | 24 ++++++++---------------- sound/soc/codecs/rt5682.h | 2 -- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/sound/soc/codecs/rt5682-i2c.c b/sound/soc/codecs/rt5682-i2c.c index 20e0f90ea498..20fc0f3766de 100644 --- a/sound/soc/codecs/rt5682-i2c.c +++ b/sound/soc/codecs/rt5682-i2c.c @@ -59,18 +59,12 @@ static void rt5682_jd_check_handler(struct work_struct *work) struct rt5682_priv *rt5682 = container_of(work, struct rt5682_priv, jd_check_work.work); - if (snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) - & RT5682_JDH_RS_MASK) { + if (snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) & RT5682_JDH_RS_MASK) /* jack out */ - rt5682->jack_type = rt5682_headset_detect(rt5682->component, 0); - - snd_soc_jack_report(rt5682->hs_jack, rt5682->jack_type, - SND_JACK_HEADSET | - SND_JACK_BTN_0 | SND_JACK_BTN_1 | - SND_JACK_BTN_2 | SND_JACK_BTN_3); - } else { + mod_delayed_work(system_power_efficient_wq, + &rt5682->jack_detect_work, 0); + else schedule_delayed_work(&rt5682->jd_check_work, 500); - } } static irqreturn_t rt5682_irq(int irq, void *data) @@ -198,7 +192,6 @@ static int rt5682_i2c_probe(struct i2c_client *i2c, } mutex_init(&rt5682->calibrate_mutex); - mutex_init(&rt5682->jdet_mutex); rt5682_calibrate(rt5682); rt5682_apply_patch_list(rt5682, &i2c->dev); diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index 415ec564c82e..0a0ec4a021e1 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -922,15 +922,13 @@ static void rt5682_enable_push_button_irq(struct snd_soc_component *component, * * Returns detect status. */ -int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) +static int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) { struct rt5682_priv *rt5682 = snd_soc_component_get_drvdata(component); struct snd_soc_dapm_context *dapm = &component->dapm; unsigned int val, count; if (jack_insert) { - snd_soc_dapm_mutex_lock(dapm); - snd_soc_component_update_bits(component, RT5682_PWR_ANLG_1, RT5682_PWR_VREF2 | RT5682_PWR_MB, RT5682_PWR_VREF2 | RT5682_PWR_MB); @@ -981,8 +979,6 @@ int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) snd_soc_component_update_bits(component, RT5682_MICBIAS_2, RT5682_PWR_CLK25M_MASK | RT5682_PWR_CLK1M_MASK, RT5682_PWR_CLK25M_PU | RT5682_PWR_CLK1M_PU); - - snd_soc_dapm_mutex_unlock(dapm); } else { rt5682_enable_push_button_irq(component, false); snd_soc_component_update_bits(component, RT5682_CBJ_CTRL_1, @@ -1011,7 +1007,6 @@ int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert) dev_dbg(component->dev, "jack_type = %d\n", rt5682->jack_type); return rt5682->jack_type; } -EXPORT_SYMBOL_GPL(rt5682_headset_detect); static int rt5682_set_jack_detect(struct snd_soc_component *component, struct snd_soc_jack *hs_jack, void *data) @@ -1094,6 +1089,7 @@ void rt5682_jack_detect_handler(struct work_struct *work) { struct rt5682_priv *rt5682 = container_of(work, struct rt5682_priv, jack_detect_work.work); + struct snd_soc_dapm_context *dapm; int val, btn_type; while (!rt5682->component) @@ -1102,7 +1098,9 @@ void rt5682_jack_detect_handler(struct work_struct *work) while (!rt5682->component->card->instantiated) usleep_range(10000, 15000); - mutex_lock(&rt5682->jdet_mutex); + dapm = snd_soc_component_get_dapm(rt5682->component); + + snd_soc_dapm_mutex_lock(dapm); mutex_lock(&rt5682->calibrate_mutex); val = snd_soc_component_read(rt5682->component, RT5682_AJD1_CTRL) @@ -1162,6 +1160,9 @@ void rt5682_jack_detect_handler(struct work_struct *work) rt5682->irq_work_delay_time = 50; } + mutex_unlock(&rt5682->calibrate_mutex); + snd_soc_dapm_mutex_unlock(dapm); + snd_soc_jack_report(rt5682->hs_jack, rt5682->jack_type, SND_JACK_HEADSET | SND_JACK_BTN_0 | SND_JACK_BTN_1 | @@ -1174,9 +1175,6 @@ void rt5682_jack_detect_handler(struct work_struct *work) else cancel_delayed_work_sync(&rt5682->jd_check_work); } - - mutex_unlock(&rt5682->calibrate_mutex); - mutex_unlock(&rt5682->jdet_mutex); } EXPORT_SYMBOL_GPL(rt5682_jack_detect_handler); @@ -1526,7 +1524,6 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, { struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); - struct rt5682_priv *rt5682 = snd_soc_component_get_drvdata(component); switch (event) { case SND_SOC_DAPM_PRE_PMU: @@ -1538,17 +1535,12 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, RT5682_DEPOP_1, 0x60, 0x60); snd_soc_component_update_bits(component, RT5682_DAC_ADC_DIG_VOL1, 0x00c0, 0x0080); - - mutex_lock(&rt5682->jdet_mutex); - snd_soc_component_update_bits(component, RT5682_HP_CTRL_2, RT5682_HP_C2_DAC_L_EN | RT5682_HP_C2_DAC_R_EN, RT5682_HP_C2_DAC_L_EN | RT5682_HP_C2_DAC_R_EN); usleep_range(5000, 10000); snd_soc_component_update_bits(component, RT5682_CHARGE_PUMP_1, RT5682_CP_SW_SIZE_MASK, RT5682_CP_SW_SIZE_L); - - mutex_unlock(&rt5682->jdet_mutex); break; case SND_SOC_DAPM_POST_PMD: diff --git a/sound/soc/codecs/rt5682.h b/sound/soc/codecs/rt5682.h index c917c76200ea..52ff0d9c36c5 100644 --- a/sound/soc/codecs/rt5682.h +++ b/sound/soc/codecs/rt5682.h @@ -1463,7 +1463,6 @@ struct rt5682_priv { int jack_type; int irq_work_delay_time; - struct mutex jdet_mutex; }; extern const char *rt5682_supply_names[RT5682_NUM_SUPPLIES]; @@ -1473,7 +1472,6 @@ int rt5682_sel_asrc_clk_src(struct snd_soc_component *component, void rt5682_apply_patch_list(struct rt5682_priv *rt5682, struct device *dev); -int rt5682_headset_detect(struct snd_soc_component *component, int jack_insert); void rt5682_jack_detect_handler(struct work_struct *work); bool rt5682_volatile_register(struct device *dev, unsigned int reg); From f6a26318e3148289717b0f39ee976f9d95a93f4d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 28 Jan 2022 10:00:29 +0200 Subject: [PATCH 0789/1295] ocfs2: fix subdirectory registration with register_sysctl() The kernel test robot reports that commit c42ff46f97c1 ("ocfs2: simplify subdirectory registration with register_sysctl()") is broken, and results in kernel warning messages like sysctl table check failed: fs/ocfs2/nm Not a file sysctl table check failed: fs/ocfs2/nm No proc_handler sysctl table check failed: fs/ocfs2/nm bogus .mode 0555 and in fact this was already reported back in linux-next, but nobody seems to have reacted to that report. Possibly that original report only ever made it to the lkp list. The problem seems to be that the simplification didn't actually go far enough, and should have converted the whole directory path to the final sysctl file, rather than just the two first components. So take that last step. Fixes: c42ff46f97c1 ("ocfs2: simplify subdirectory registration with register_sysctl()") Reported-by: kernel test robot Link: https://lore.kernel.org/all/20220128065310.GF8421@xsang-OptiPlex-9020/ Link: https://lists.01.org/hyperkitty/list/lkp@lists.01.org/thread/KQ2F6TPJWMDVEXJM4WTUC4DU3EH3YJVT/ Tested-by: Jan Kara Reviewed-by: Jan Kara Cc: Luis Chamberlain Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 731558a6f27d..dd77b7aaabf5 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -661,17 +661,6 @@ static struct ctl_table ocfs2_nm_table[] = { { } }; -static struct ctl_table ocfs2_mod_table[] = { - { - .procname = "nm", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_nm_table - }, - { } -}; - static struct ctl_table_header *ocfs2_table_header; /* @@ -682,7 +671,7 @@ static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); - ocfs2_table_header = register_sysctl("fs/ocfs2", ocfs2_mod_table); + ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); From 297ae1eb23b04c5a46111ab53c8d0f69af43f402 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 25 Jan 2022 15:40:40 +0000 Subject: [PATCH 0790/1295] arm64: cpufeature: List early Cortex-A510 parts as having broken dbm Versions of Cortex-A510 before r0p3 are affected by a hardware erratum where the hardware update of the dirty bit is not correctly ordered. Add these cpus to the cpu_has_broken_dbm list. Signed-off-by: James Morse Link: https://lore.kernel.org/r/20220125154040.549272-3-james.morse@arm.com Signed-off-by: Catalin Marinas --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 10 ++++++++++ arch/arm64/kernel/cpufeature.c | 3 +++ 3 files changed, 15 insertions(+) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 1b0e53ececda..0ec7b7f1524b 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -98,6 +98,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1d7036e10215..f2b5a4abef21 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -670,6 +670,16 @@ config ARM64_ERRATUM_1508412 config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE bool +config ARM64_ERRATUM_2051678 + bool "Cortex-A510: 2051678: disable Hardware Update of the page table dirty bit" + help + This options adds the workaround for ARM Cortex-A510 erratum ARM64_ERRATUM_2051678. + Affected Coretex-A510 might not respect the ordering rules for + hardware update of the page table's dirty bit. The workaround + is to not enable the feature on affected CPUs. + + If unsure, say Y. + config ARM64_ERRATUM_2119858 bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode" default y diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a46ab3b1c4d5..e5f23dab1c8d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1645,6 +1645,9 @@ static bool cpu_has_broken_dbm(void) MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), /* Kryo4xx Silver (rdpe => r1p0) */ MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2051678 + MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), #endif {}, }; From 483529f3209f56d4c7a465d045278a2546ae7ed9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:34 +0000 Subject: [PATCH 0791/1295] Fix a warning about a malformed kernel doc comment in cifs Fix by removing the extra asterisk. Signed-off-by: David Howells Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 11a22a30ee14..ed210d774a21 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -162,7 +162,7 @@ static void cifs_resolve_server(struct work_struct *work) mutex_unlock(&server->srv_mutex); } -/** +/* * Mark all sessions and tcons for reconnect. * * @server needs to be previously set to CifsNeedReconnect. From b856101a1774b5f1c8c99e8dfdef802856520732 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 19 Jan 2022 10:37:55 +0200 Subject: [PATCH 0792/1295] IB/cm: Release previously acquired reference counter in the cm_id_priv In failure flow, the reference counter acquired was not released, and the following error was reported: drivers/infiniband/core/cm.c:3373 cm_lap_handler() warn: inconsistent refcounting 'cm_id_priv->refcount.refs.counter': Fixes: 7345201c3963 ("IB/cm: Improve the calling of cm_init_av_for_lap and cm_init_av_by_path") Link: https://lore.kernel.org/r/7615f23bbb5c5b66d03f6fa13e1c99d51dae6916.1642581448.git.leonro@nvidia.com Reported-by: Dan Carpenter Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c903b74f46a4..35f0d5e7533d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3322,7 +3322,7 @@ static int cm_lap_handler(struct cm_work *work) ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); if (ret) { rdma_destroy_ah_attr(&ah_attr); - return -EINVAL; + goto deref; } spin_lock_irq(&cm_id_priv->lock); From 4028bccb003cf67e46632dee7f97ddc5d7b6e685 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 19 Jan 2022 04:28:09 -0500 Subject: [PATCH 0793/1295] IB/rdmavt: Validate remote_addr during loopback atomic tests The rdma-core test suite sends an unaligned remote address and expects a failure. ERROR: test_atomic_non_aligned_addr (tests.test_atomic.AtomicTest) The qib/hfi1 rc handling validates properly, but the test has the client and server on the same system. The loopback of these operations is a distinct code path. Fix by syntaxing the proposed remote address in the loopback code path. Fixes: 15703461533a ("IB/{hfi1, qib, rdmavt}: Move ruc_loopback to rdmavt") Link: https://lore.kernel.org/r/1642584489-141005-1-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3305f2744bfa..ae50b56e8913 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -3073,6 +3073,8 @@ do_write: case IB_WR_ATOMIC_FETCH_AND_ADD: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto inv_err; + if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1))) + goto inv_err; if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), wqe->atomic_wr.remote_addr, wqe->atomic_wr.rkey, From e8cc7a5d1ad2d44e7f43664ef6a61e31c0545a5b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 26 Jan 2022 13:32:05 +0100 Subject: [PATCH 0794/1295] dt-bindings: irqchip: renesas-irqc: Add R-Car V3U support Document support for the Interrupt Controller for External Devices (INT-EC) in the Renesas R-Car V3U (r8a779a0) SoC. Signed-off-by: Geert Uytterhoeven Tested-by: Kieran Bingham Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/85b246cc0792663c72c1bb12a8576bd23d2299d3.1643200256.git.geert+renesas@glider.be --- .../devicetree/bindings/interrupt-controller/renesas,irqc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml index 79d0358e2f61..620f01775e42 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml @@ -36,6 +36,7 @@ properties: - renesas,intc-ex-r8a77980 # R-Car V3H - renesas,intc-ex-r8a77990 # R-Car E3 - renesas,intc-ex-r8a77995 # R-Car D3 + - renesas,intc-ex-r8a779a0 # R-Car V3U - const: renesas,irqc '#interrupt-cells': From 8fbc16d26d3a1ed3d80553b773be29408750987b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 28 Jan 2022 10:03:57 +0100 Subject: [PATCH 0795/1295] dt-bindings: interrupt-controller: sifive,plic: Fix number of interrupts The number of interrupts lacks an upper bound, thus assuming one, causing properly grouped "interrupts-extended" properties to be flagged as an error by "make dtbs_check". Fix this by adding the missing "maxItems", using the architectural maximum of 15872 interrupts. Signed-off-by: Geert Uytterhoeven Acked-by: Rob Herring Reviewed-by: Anup Patel Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/f73a0aead89e1426b146c4c64f797aa035868bf0.1643360419.git.geert@linux-m68k.org --- .../bindings/interrupt-controller/sifive,plic-1.0.0.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index 08d5a57ce00f..5edaa08f576e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -61,6 +61,7 @@ properties: interrupts-extended: minItems: 1 + maxItems: 15872 description: Specifies which contexts are connected to the PLIC, with "-1" specifying that a context is not present. Each node pointed to should be a From c89e5eb7dcf1519e5e084ee82e0d29d4e751ddb7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 28 Jan 2022 10:03:58 +0100 Subject: [PATCH 0796/1295] dt-bindings: interrupt-controller: sifive,plic: Group interrupt tuples To improve human readability and enable automatic validation, the tuples in "interrupts-extended" properties should be grouped using angle brackets. Signed-off-by: Geert Uytterhoeven Reviewed-by: Rob Herring Reviewed-by: Anup Patel Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/211705e74a2ce77de43d036c5dea032484119bf7.1643360419.git.geert@linux-m68k.org --- .../interrupt-controller/sifive,plic-1.0.0.yaml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index 5edaa08f576e..058997c4e3cd 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -90,12 +90,11 @@ examples: #interrupt-cells = <1>; compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; interrupt-controller; - interrupts-extended = < - &cpu0_intc 11 - &cpu1_intc 11 &cpu1_intc 9 - &cpu2_intc 11 &cpu2_intc 9 - &cpu3_intc 11 &cpu3_intc 9 - &cpu4_intc 11 &cpu4_intc 9>; + interrupts-extended = <&cpu0_intc 11>, + <&cpu1_intc 11>, <&cpu1_intc 9>, + <&cpu2_intc 11>, <&cpu2_intc 9>, + <&cpu3_intc 11>, <&cpu3_intc 9>, + <&cpu4_intc 11>, <&cpu4_intc 9>; reg = <0xc000000 0x4000000>; riscv,ndev = <10>; }; From 7f5056b9e7b71149bf11073f00a57fa1ac2921a9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 26 Jan 2022 15:35:14 -0500 Subject: [PATCH 0797/1295] security, lsm: dentry_init_security() Handle multi LSM registration A ceph user has reported that ceph is crashing with kernel NULL pointer dereference. Following is the backtrace. /proc/version: Linux version 5.16.2-arch1-1 (linux@archlinux) (gcc (GCC) 11.1.0, GNU ld (GNU Binutils) 2.36.1) #1 SMP PREEMPT Thu, 20 Jan 2022 16:18:29 +0000 distro / arch: Arch Linux / x86_64 SELinux is not enabled ceph cluster version: 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) relevant dmesg output: [ 30.947129] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 30.947206] #PF: supervisor read access in kernel mode [ 30.947258] #PF: error_code(0x0000) - not-present page [ 30.947310] PGD 0 P4D 0 [ 30.947342] Oops: 0000 [#1] PREEMPT SMP PTI [ 30.947388] CPU: 5 PID: 778 Comm: touch Not tainted 5.16.2-arch1-1 #1 86fbf2c313cc37a553d65deb81d98e9dcc2a3659 [ 30.947486] Hardware name: Gigabyte Technology Co., Ltd. B365M DS3H/B365M DS3H, BIOS F5 08/13/2019 [ 30.947569] RIP: 0010:strlen+0x0/0x20 [ 30.947616] Code: b6 07 38 d0 74 16 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 31 d2 89 d6 89 d7 c3 48 89 f8 31 d2 89 d6 89 d7 c3 0 f 1f 40 00 <80> 3f 00 74 12 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 31 ff [ 30.947782] RSP: 0018:ffffa4ed80ffbbb8 EFLAGS: 00010246 [ 30.947836] RAX: 0000000000000000 RBX: ffffa4ed80ffbc60 RCX: 0000000000000000 [ 30.947904] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 30.947971] RBP: ffff94b0d15c0ae0 R08: 0000000000000000 R09: 0000000000000000 [ 30.948040] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 30.948106] R13: 0000000000000001 R14: ffffa4ed80ffbc60 R15: 0000000000000000 [ 30.948174] FS: 00007fc7520f0740(0000) GS:ffff94b7ced40000(0000) knlGS:0000000000000000 [ 30.948252] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 30.948308] CR2: 0000000000000000 CR3: 0000000104a40001 CR4: 00000000003706e0 [ 30.948376] Call Trace: [ 30.948404] [ 30.948431] ceph_security_init_secctx+0x7b/0x240 [ceph 49f9c4b9bf5be8760f19f1747e26da33920bce4b] [ 30.948582] ceph_atomic_open+0x51e/0x8a0 [ceph 49f9c4b9bf5be8760f19f1747e26da33920bce4b] [ 30.948708] ? get_cached_acl+0x4d/0xa0 [ 30.948759] path_openat+0x60d/0x1030 [ 30.948809] do_filp_open+0xa5/0x150 [ 30.948859] do_sys_openat2+0xc4/0x190 [ 30.948904] __x64_sys_openat+0x53/0xa0 [ 30.948948] do_syscall_64+0x5c/0x90 [ 30.948989] ? exc_page_fault+0x72/0x180 [ 30.949034] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 30.949091] RIP: 0033:0x7fc7521e25bb [ 30.950849] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 0 0 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 54 24 28 64 48 2b 14 25 Core of the problem is that ceph checks for return code from security_dentry_init_security() and if return code is 0, it assumes everything is fine and continues to call strlen(name), which crashes. Typically SELinux LSM returns 0 and sets name to "security.selinux" and it is not a problem. Or if selinux is not compiled in or disabled, it returns -EOPNOTSUP and ceph deals with it. But somehow in this configuration, 0 is being returned and "name" is not being initialized and that's creating the problem. Our suspicion is that BPF LSM is registering a hook for dentry_init_security() and returns hook default of 0. LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry,...) I have not been able to reproduce it just by doing CONFIG_BPF_LSM=y. Stephen has tested the patch though and confirms it solves the problem for him. dentry_init_security() is written in such a way that it expects only one LSM to register the hook. Atleast that's the expectation with current code. If another LSM returns a hook and returns default, it will simply return 0 as of now and that will break ceph. Hence, suggestion is that change semantics of this hook a bit. If there are no LSMs or no LSM is taking ownership and initializing security context, then return -EOPNOTSUP. Also allow at max one LSM to initialize security context. This hook can't deal with multiple LSMs trying to init security context. This patch implements this new behavior. Reported-by: Stephen Muth Tested-by: Stephen Muth Suggested-by: Casey Schaufler Acked-by: Casey Schaufler Reviewed-by: Serge Hallyn Cc: Jeff Layton Cc: Christian Brauner Cc: Paul Moore Cc: # 5.16.0 Signed-off-by: Vivek Goyal Reviewed-by: Jeff Layton Acked-by: Paul Moore Acked-by: Christian Brauner Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 2 +- security/security.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..f0c7b352340a 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -82,7 +82,7 @@ LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, int len, void **mnt_opts) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) -LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, +LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, diff --git a/security/security.c b/security/security.c index c88167a414b4..64abdfb20bc2 100644 --- a/security/security.c +++ b/security/security.c @@ -1056,8 +1056,19 @@ int security_dentry_init_security(struct dentry *dentry, int mode, const char **xattr_name, void **ctx, u32 *ctxlen) { - return call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode, - name, xattr_name, ctx, ctxlen); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.dentry_init_security, list) { + rc = hp->hook.dentry_init_security(dentry, mode, name, + xattr_name, ctx, ctxlen); + if (rc != LSM_RET_DEFAULT(dentry_init_security)) + return rc; + } + return LSM_RET_DEFAULT(dentry_init_security); } EXPORT_SYMBOL(security_dentry_init_security); From e45c47d1f94e0cc7b6b079fdb4bcce2995e2adc4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 28 Jan 2022 10:58:39 -0500 Subject: [PATCH 0798/1295] block: add bio_start_io_acct_time() to control start_time bio_start_io_acct_time() interface is like bio_start_io_acct() that allows start_time to be passed in. This gives drivers the ability to defer starting accounting until after IO is issued (but possibily not entirely due to bio splitting). Reviewed-by: Christoph Hellwig Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220128155841.39644-2-snitzer@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 25 +++++++++++++++++++------ include/linux/blkdev.h | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 97f8bc8d3a79..d93e3bb9a769 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1061,21 +1061,33 @@ again: } static unsigned long __part_start_io_acct(struct block_device *part, - unsigned int sectors, unsigned int op) + unsigned int sectors, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); - unsigned long now = READ_ONCE(jiffies); part_stat_lock(); - update_io_ticks(part, now, false); + update_io_ticks(part, start_time, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_stat_local_inc(part, in_flight[op_is_write(op)]); part_stat_unlock(); - return now; + return start_time; } +/** + * bio_start_io_acct_time - start I/O accounting for bio based drivers + * @bio: bio to start account for + * @start_time: start time that should be passed back to bio_end_io_acct(). + */ +void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) +{ + __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), start_time); +} +EXPORT_SYMBOL_GPL(bio_start_io_acct_time); + /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for @@ -1084,14 +1096,15 @@ static unsigned long __part_start_io_acct(struct block_device *part, */ unsigned long bio_start_io_acct(struct bio *bio) { - return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); + return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(disk->part0, sectors, op); + return __part_start_io_acct(disk->part0, sectors, op, jiffies); } EXPORT_SYMBOL(disk_start_io_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9c95df26fc26..f35aea98bc35 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1258,6 +1258,7 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); +void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); From f524d9c95fab54783d0038f7a3e8c014d5b56857 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 28 Jan 2022 10:58:40 -0500 Subject: [PATCH 0799/1295] dm: revert partial fix for redundant bio-based IO accounting Reverts a1e1cb72d9649 ("dm: fix redundant IO accounting for bios that need splitting") because it was too narrow in scope (only addressed redundant 'sectors[]' accounting and not ios, nsecs[], etc). Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220128155841.39644-3-snitzer@redhat.com Signed-off-by: Jens Axboe --- drivers/md/dm.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c0ae8087c602..9849114b3c08 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1442,9 +1442,6 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md, ci->sector = bio->bi_iter.bi_sector; } -#define __dm_part_stat_sub(part, field, subnd) \ - (part_stat_get(part, field) -= (subnd)) - /* * Entry point to split a bio into clones and submit them to the targets. */ @@ -1480,18 +1477,6 @@ static void __split_and_process_bio(struct mapped_device *md, GFP_NOIO, &md->queue->bio_split); ci.io->orig_bio = b; - /* - * Adjust IO stats for each split, otherwise upon queue - * reentry there will be redundant IO accounting. - * NOTE: this is a stop-gap fix, a proper fix involves - * significant refactoring of DM core's bio splitting - * (by eliminating DM's splitting and just using bio_split) - */ - part_stat_lock(); - __dm_part_stat_sub(dm_disk(md)->part0, - sectors[op_stat_group(bio_op(bio))], ci.sector_count); - part_stat_unlock(); - bio_chain(b, bio); trace_block_split(b, bio->bi_iter.bi_sector); submit_bio_noacct(bio); From b879f915bc48a18d4f4462729192435bb0f17052 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 28 Jan 2022 10:58:41 -0500 Subject: [PATCH 0800/1295] dm: properly fix redundant bio-based IO accounting Record the start_time for a bio but defer the starting block core's IO accounting until after IO is submitted using bio_start_io_acct_time(). This approach avoids the need to mess around with any of the individual IO stats in response to a bio_split() that follows bio submission. Reported-by: Bud Brown Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Depends-on: e45c47d1f94e ("block: add bio_start_io_acct_time() to control start_time") Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220128155841.39644-4-snitzer@redhat.com Signed-off-by: Jens Axboe --- drivers/md/dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9849114b3c08..dcbd6d201619 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -489,7 +489,7 @@ static void start_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - io->start_time = bio_start_io_acct(bio); + bio_start_io_acct_time(bio, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -535,7 +535,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io->md = md; spin_lock_init(&io->endio_lock); - start_io_acct(io); + io->start_time = jiffies; return io; } @@ -1482,6 +1482,7 @@ static void __split_and_process_bio(struct mapped_device *md, submit_bio_noacct(bio); } } + start_io_acct(ci.io); /* drop the extra reference count */ dm_io_dec_pending(ci.io, errno_to_blk_status(error)); From 279eb8575fdaa92c314a54c0d583c65e26229107 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 24 Jan 2022 21:55:02 +0300 Subject: [PATCH 0801/1295] EDAC/altera: Fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENODEV for some strange reason, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the proper error codes to platform driver code upwards. [ bp: Massage commit message. ] Fixes: 71bcada88b0f ("edac: altera: Add Altera SDRAM EDAC support") Signed-off-by: Sergey Shtylyov Signed-off-by: Borislav Petkov Acked-by: Dinh Nguyen Cc: Link: https://lore.kernel.org/r/20220124185503.6720-2-s.shtylyov@omp.ru --- drivers/edac/altera_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 3a6d2416cb0f..5dd29789f97d 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -350,7 +350,7 @@ static int altr_sdram_probe(struct platform_device *pdev) if (irq < 0) { edac_printk(KERN_ERR, EDAC_MC, "No irq %d in DT\n", irq); - return -ENODEV; + return irq; } /* Arria10 has a 2nd IRQ */ From 1601033da2dd2052e0489137f7788a46a8fcd82f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 28 Jan 2022 19:24:43 +0000 Subject: [PATCH 0802/1295] ASoC: ops: Check for negative values before reading them The controls allow inputs to be specified as negative but our manipulating them into register fields need to be done on unsigned variables so the checks for negative numbers weren't taking effect properly. Do the checks for negative values on the variable in the ABI struct rather than on our local unsigned copy. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220128192443.3504823-1-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index dc0e7c8d31f3..9833611b83d1 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -316,26 +316,26 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (sign_bit) mask = BIT(sign_bit + 1) - 1; + if (ucontrol->value.integer.value[0] < 0) + return -EINVAL; val = ucontrol->value.integer.value[0]; if (mc->platform_max && val > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; - if (val < 0) - return -EINVAL; val = (val + min) & mask; if (invert) val = max - val; val_mask = mask << shift; val = val << shift; if (snd_soc_volsw_is_stereo(mc)) { + if (ucontrol->value.integer.value[1] < 0) + return -EINVAL; val2 = ucontrol->value.integer.value[1]; if (mc->platform_max && val2 > mc->platform_max) return -EINVAL; if (val2 > max - min) return -EINVAL; - if (val2 < 0) - return -EINVAL; val2 = (val2 + min) & mask; if (invert) val2 = max - val2; @@ -423,13 +423,13 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, int err = 0; unsigned int val, val_mask; + if (ucontrol->value.integer.value[0] < 0) + return -EINVAL; val = ucontrol->value.integer.value[0]; if (mc->platform_max && val > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; - if (val < 0) - return -EINVAL; val_mask = mask << shift; val = (val + min) & mask; val = val << shift; From dbe0d009d8c23c7408da9721c1378a5f661aaa83 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 14 Jan 2022 09:59:06 +0100 Subject: [PATCH 0803/1295] arm64: dts: freescale: Fix sound card model for MBa8Mx The audio codec connection on MBa8Mx is identical to MBa7 (imx7) and MBa6 (imx6). Use the same sound card model as well. Fixes commit dfcd1b6f7620 ("arm64: dts: freescale: add initial device tree for TQMa8MQML with i.MX8MM") Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/mba8mx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/mba8mx.dtsi b/arch/arm64/boot/dts/freescale/mba8mx.dtsi index f27e3c8de916..ce6d5bdba0a8 100644 --- a/arch/arm64/boot/dts/freescale/mba8mx.dtsi +++ b/arch/arm64/boot/dts/freescale/mba8mx.dtsi @@ -91,7 +91,7 @@ sound { compatible = "fsl,imx-audio-tlv320aic32x4"; - model = "tqm-tlv320aic32"; + model = "imx-audio-tlv320aic32x4"; ssi-controller = <&sai3>; audio-codec = <&tlv320aic3x04>; }; From ff3cfc35a4dd4b432e33eee4f6d772411d70399f Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sat, 15 Jan 2022 22:04:32 +0100 Subject: [PATCH 0804/1295] arm64: dts: ls1028a: sl28: re-enable ftm_alarm0 Commit dd3d936a1b17 ("arm64: dts: ls1028a: add ftm_alarm1 node to be used as wakeup source") disables ftm_alarm0 in the SoC dtsi but doesn't enable it on the board which is still using it. Re-enable it on the sl28 board. Fixes: dd3d936a1b17 ("arm64: dts: ls1028a: add ftm_alarm1 node to be used as wakeup source") Reported-by: Guillaume Tucker Reported-by: "kernelci.org bot" Signed-off-by: Michael Walle Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts index d74e738e4070..c03f4e183389 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts @@ -157,6 +157,10 @@ }; }; +&ftm_alarm0 { + status = "okay"; +}; + &gpio1 { gpio-line-names = "", "", "", "", "", "", "", "", From 6d58c5e21a3fe355ce6d1808e96d02a610265218 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 20 Jan 2022 11:23:55 -0600 Subject: [PATCH 0805/1295] ARM: dts: imx7ulp: Fix 'assigned-clocks-parents' typo The correct property name is 'assigned-clock-parents', not 'assigned-clocks-parents'. Though if the platform works with the typo, one has to wonder if the property is even needed. Signed-off-by: Rob Herring Fixes: 8b8c7d97e2c7 ("ARM: dts: imx7ulp: Add wdog1 node") Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7ulp.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/imx7ulp.dtsi b/arch/arm/boot/dts/imx7ulp.dtsi index b7ea37ad4e55..bcec98b96411 100644 --- a/arch/arm/boot/dts/imx7ulp.dtsi +++ b/arch/arm/boot/dts/imx7ulp.dtsi @@ -259,7 +259,7 @@ interrupts = ; clocks = <&pcc2 IMX7ULP_CLK_WDG1>; assigned-clocks = <&pcc2 IMX7ULP_CLK_WDG1>; - assigned-clocks-parents = <&scg1 IMX7ULP_CLK_FIRC_BUS_CLK>; + assigned-clock-parents = <&scg1 IMX7ULP_CLK_FIRC_BUS_CLK>; timeout-sec = <40>; }; From 283d45145fbf460dbaf0229cacd7ed60ec52f364 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 21 Jan 2022 10:33:25 +0100 Subject: [PATCH 0806/1295] arm64: dts: imx8mq: fix mipi_csi bidirectional port numbers The port numbers for the imx8mq mipi csi controller are wrong and the mipi driver can't find any media devices as port@1 is connected to the CSI bridge, not port@0. And port@0 is connected to the source - the sensor. Fix this. Fixes: bcadd5f66c2a ("arm64: dts: imx8mq: add mipi csi phy and csi bridge descriptions") Signed-off-by: Martin Kepplinger Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mq.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi index 2df2510d0118..bb68c94c2fc9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi @@ -1151,8 +1151,8 @@ #address-cells = <1>; #size-cells = <0>; - port@0 { - reg = <0>; + port@1 { + reg = <1>; csi1_mipi_ep: endpoint { remote-endpoint = <&csi1_ep>; @@ -1203,8 +1203,8 @@ #address-cells = <1>; #size-cells = <0>; - port@0 { - reg = <0>; + port@1 { + reg = <1>; csi2_mipi_ep: endpoint { remote-endpoint = <&csi2_ep>; From 5ea62d06b1899f63c4374f52c8d40c43cad69ec0 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 21 Jan 2022 10:33:26 +0100 Subject: [PATCH 0807/1295] arm64: dts: imx8mq-librem5: fix mipi_csi1 port number to sensor Since the previous commit fixed a hardware description bug for imx8mq, we need to fix up all DT users like this. The mipi_csi port@0 is connected to the sensor, not port@1. Fixes: fed7603597fa ("arm64: dts: imx8mq-librem5: describe the selfie cam") Signed-off-by: Martin Kepplinger Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mq-librem5.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mq-librem5.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-librem5.dtsi index f3e3418f7edc..2d4a472af6a9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-librem5.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq-librem5.dtsi @@ -1115,8 +1115,8 @@ status = "okay"; ports { - port@1 { - reg = <1>; + port@0 { + reg = <0>; mipi1_sensor_ep: endpoint { remote-endpoint = <&camera1_ep>; From 91f6d5f181f6629dd74ab71759fe92d3f4eff966 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Sat, 29 Jan 2022 14:39:05 +0800 Subject: [PATCH 0808/1295] arm64: dts: imx8mq: fix lcdif port node The port node does not have a unit-address, remove it. This fixes the warnings: lcd-controller@30320000: 'port' is a required property lcd-controller@30320000: 'port@0' does not match any of the regexes: 'pinctrl-[0-9]+' Fixes: commit d0081bd02a03 ("arm64: dts: imx8mq: Add NWL MIPI DSI controller") Signed-off-by: Alexander Stein Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mq.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi index bb68c94c2fc9..e92ebb6147e6 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi @@ -554,7 +554,7 @@ assigned-clock-rates = <0>, <0>, <0>, <594000000>; status = "disabled"; - port@0 { + port { lcdif_mipi_dsi: endpoint { remote-endpoint = <&mipi_dsi_lcdif_in>; }; From 489f710a738e24d887823a010b8b206b4124e26f Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sat, 29 Jan 2022 09:32:33 +0000 Subject: [PATCH 0809/1295] cifs: unlock chan_lock before calling cifs_put_tcp_session While removing an smb session, we need to free up the tcp session for each channel for that session. We were doing this with chan_lock held. This results in a cyclic dependency with cifs_tcp_ses_lock. For now, unlock the chan_lock temporarily before calling cifs_put_tcp_session. This should not cause any problem for now, since we do not remove channels anywhere else. And this code segment will not be called by two threads. When we do implement the code for removing channels, we will need to execute proper ref counting here. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ed210d774a21..5a51a098b845 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1831,13 +1831,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - /* - * note: for now, we're okay accessing ses->chans - * without chan_lock. But when chans can go away, we'll - * need to introduce ref counting to make sure that chan - * is not freed from under us. - */ + spin_unlock(&ses->chan_lock); cifs_put_tcp_session(ses->chans[i].server, 0); + spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } From dfd0dfb9a7cc04acf93435b440dd34c2ca7b4424 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 24 Jan 2022 21:55:03 +0300 Subject: [PATCH 0810/1295] EDAC/xgene: Fix deferred probing The driver overrides error codes returned by platform_get_irq_optional() to -EINVAL for some strange reason, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the proper error codes to platform driver code upwards. [ bp: Massage commit message. ] Fixes: 0d4429301c4a ("EDAC: Add APM X-Gene SoC EDAC driver") Signed-off-by: Sergey Shtylyov Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220124185503.6720-3-s.shtylyov@omp.ru --- drivers/edac/xgene_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 2ccd1db5e98f..7197f9fa0245 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -1919,7 +1919,7 @@ static int xgene_edac_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, i); if (irq < 0) { dev_err(&pdev->dev, "No IRQ resource\n"); - rc = -EINVAL; + rc = irq; goto out_err; } rc = devm_request_irq(&pdev->dev, irq, From 5297c693d8c8e08fa742e3112cf70723f7a04da2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 27 Jan 2022 13:50:31 -0800 Subject: [PATCH 0811/1295] pinctrl: bcm2835: Fix a few error paths After commit 266423e60ea1 ("pinctrl: bcm2835: Change init order for gpio hogs") a few error paths would not unwind properly the registration of gpio ranges. Correct that by assigning a single error label and goto it whenever we encounter a fatal error. Fixes: 266423e60ea1 ("pinctrl: bcm2835: Change init order for gpio hogs") Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20220127215033.267227-1-f.fainelli@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index c4ebfa852b42..47e433e09c5c 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1269,16 +1269,18 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) sizeof(*girq->parents), GFP_KERNEL); if (!girq->parents) { - pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); - return -ENOMEM; + err = -ENOMEM; + goto out_remove; } if (is_7211) { pc->wake_irq = devm_kcalloc(dev, BCM2835_NUM_IRQS, sizeof(*pc->wake_irq), GFP_KERNEL); - if (!pc->wake_irq) - return -ENOMEM; + if (!pc->wake_irq) { + err = -ENOMEM; + goto out_remove; + } } /* @@ -1306,8 +1308,10 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) len = strlen(dev_name(pc->dev)) + 16; name = devm_kzalloc(pc->dev, len, GFP_KERNEL); - if (!name) - return -ENOMEM; + if (!name) { + err = -ENOMEM; + goto out_remove; + } snprintf(name, len, "%s:bank%d", dev_name(pc->dev), i); @@ -1326,11 +1330,14 @@ static int bcm2835_pinctrl_probe(struct platform_device *pdev) err = gpiochip_add_data(&pc->gpio_chip, pc); if (err) { dev_err(dev, "could not add GPIO chip\n"); - pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); - return err; + goto out_remove; } return 0; + +out_remove: + pinctrl_remove_gpio_range(pc->pctl_dev, &pc->gpio_range); + return err; } static struct platform_driver bcm2835_pinctrl_driver = { From 3a5286955bf5febc3d151bcb2c5e272e383b64aa Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 01:25:57 -0500 Subject: [PATCH 0812/1295] pinctrl: bcm63xx: fix unmet dependency on REGMAP for GPIO_REGMAP When PINCTRL_BCM63XX is selected, and REGMAP is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for GPIO_REGMAP Depends on [n]: GPIOLIB [=y] && REGMAP [=n] Selected by [y]: - PINCTRL_BCM63XX [=y] && PINCTRL [=y] This is because PINCTRL_BCM63XX selects GPIO_REGMAP without selecting or depending on REGMAP, despite GPIO_REGMAP depending on REGMAP. This unmet dependency bug was detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Link: https://lore.kernel.org/r/20220117062557.89568-1-julianbraha@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/bcm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/bcm/Kconfig b/drivers/pinctrl/bcm/Kconfig index 5123f4c33854..ac1e400bbbac 100644 --- a/drivers/pinctrl/bcm/Kconfig +++ b/drivers/pinctrl/bcm/Kconfig @@ -35,6 +35,7 @@ config PINCTRL_BCM63XX select PINCONF select GENERIC_PINCONF select GPIOLIB + select REGMAP select GPIO_REGMAP config PINCTRL_BCM6318 From 6cb917411e028dcb66ce8f5db1b47361b78d7d3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 29 Jan 2022 13:40:52 -0800 Subject: [PATCH 0813/1295] include/linux/sysctl.h: fix register_sysctl_mount_point() return type The CONFIG_SYSCTL=n stub returns the wrong type. Fixes: ee9efac48a082 ("sysctl: add helper to register a sysctl mount point") Reported-by: kernel test robot Acked-by: Luis Chamberlain Cc: Tong Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 180adf7da785..6353d6db69b2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -265,7 +265,7 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } -static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; } From e7f1e8834b2b2144dfbe0b2235d05e4f6f95882e Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sat, 29 Jan 2022 13:40:55 -0800 Subject: [PATCH 0814/1295] binfmt_misc: fix crash when load/unload module We should unregister the table upon module unload otherwise something horrible will happen when we load binfmt_misc module again. Also note that we should keep value returned by register_sysctl_mount_point() and release it later, otherwise it will leak. Also, per Christian's comment, to fully restore the old behavior that won't break userspace the check(binfmt_misc_header) should be eliminated. To reproduce: modprobe binfmt_misc modprobe -r binfmt_misc modprobe binfmt_misc modprobe -r binfmt_misc modprobe binfmt_misc resulting in modprobe: can't load module binfmt_misc (kernel/fs/binfmt_misc.ko): Cannot allocate memory and an unhappy kernel: binfmt_misc: Failed to create fs/binfmt_misc sysctl mount point binfmt_misc: Failed to create fs/binfmt_misc sysctl mount point BUG: unable to handle page fault for address: fffffbfff8004802 Call Trace: init_misc_binfmt+0x2d/0x1000 [binfmt_misc] Link: https://lkml.kernel.org/r/20220124181812.1869535-2-ztong0001@gmail.com Fixes: 3ba442d5331f ("fs: move binfmt_misc sysctl to its own file") Signed-off-by: Tong Zhang Co-developed-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Eric Biederman Cc: Kees Cook Cc: Iurii Zaikin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ddea6acbddde..c07f35719ee3 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -817,20 +817,20 @@ static struct file_system_type bm_fs_type = { }; MODULE_ALIAS_FS("binfmt_misc"); +static struct ctl_table_header *binfmt_misc_header; + static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - if (!register_sysctl_mount_point("fs/binfmt_misc")) { - pr_warn("Failed to create fs/binfmt_misc sysctl mount point"); - return -ENOMEM; - } + binfmt_misc_header = register_sysctl_mount_point("fs/binfmt_misc"); return 0; } static void __exit exit_misc_binfmt(void) { + unregister_sysctl_table(binfmt_misc_header); unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } From dbecf9b8b8ce580f4e11afed9d61e8aa294cddd2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 Jan 2022 13:40:58 -0800 Subject: [PATCH 0815/1295] ia64: make IA64_MCA_RECOVERY bool instead of tristate In linux-next, IA64_MCA_RECOVERY uses the (new) function make_task_dead(), which is not exported for use by modules. Instead of exporting it for one user, convert IA64_MCA_RECOVERY to be a bool Kconfig symbol. In a config file from "kernel test robot " for a different problem, this linker error was exposed when CONFIG_IA64_MCA_RECOVERY=m. Fixes this build error: ERROR: modpost: "make_task_dead" [arch/ia64/kernel/mca_recovery.ko] undefined! Link: https://lkml.kernel.org/r/20220124213129.29306-1-rdunlap@infradead.org Fixes: 0e25498f8cd4 ("exit: Add and use make_task_dead.") Signed-off-by: Randy Dunlap Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: "Eric W. Biederman" Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 703952819e10..a7e01573abd8 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -318,7 +318,7 @@ config ARCH_PROC_KCORE_TEXT depends on PROC_KCORE config IA64_MCA_RECOVERY - tristate "MCA recovery from errors other than TLB." + bool "MCA recovery from errors other than TLB." config IA64_PALINFO tristate "/proc/pal support" From 61e28cf0543c7d8e6ef88c3c305f727c5a21ba5b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Sat, 29 Jan 2022 13:41:01 -0800 Subject: [PATCH 0816/1295] memory-failure: fetch compound_head after pgmap_pfn_valid() memory_failure_dev_pagemap() at the moment assumes base pages (e.g. dax_lock_page()). For devmap with compound pages fetch the compound_head in case a tail page memory failure is being handled. Currently this is a nop, but in the advent of compound pages in dev_pagemap it allows memory_failure_dev_pagemap() to keep working. Without this fix memory-failure handling (i.e. MCEs on pmem) with device-dax configured namespaces will regress (and crash). Link: https://lkml.kernel.org/r/20211202204422.26777-2-joao.m.martins@oracle.com Reported-by: Jane Chu Signed-off-by: Joao Martins Reviewed-by: Naoya Horiguchi Reviewed-by: Dan Williams Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 14ae5c18e776..97a9ed8f87a9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1595,6 +1595,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto out; } + /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by From 536f4217ced62b671bd759f6b549621a5654a70f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 29 Jan 2022 13:41:04 -0800 Subject: [PATCH 0817/1295] mm: page->mapping folio->mapping should have the same offset As with the other members of folio, the offset of page->mapping and folio->mapping must be the same. The compile-time check was inadvertently removed during development. Add it back. [willy@infradead.org: changelog redo] Link: https://lkml.kernel.org/r/20220104011734.21714-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9db36dc5d4cf..5140e5feb486 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -261,6 +261,7 @@ static_assert(sizeof(struct page) == sizeof(struct folio)); static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); +FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); From 0226bd64da52aa23120d1450c37a424387827a21 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sat, 29 Jan 2022 13:41:07 -0800 Subject: [PATCH 0818/1295] tools/testing/scatterlist: add missing defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cited commits replaced preemptible with pagefault_disabled and flush_kernel_dcache_page with flush_dcache_page respectively, hence need to update the corresponding defines in the test. scatterlist.c: In function ‘sg_miter_stop’: scatterlist.c:919:4: warning: implicit declaration of function ‘flush_dcache_page’ [-Wimplicit-function-declaration] flush_dcache_page(miter->page); ^~~~~~~~~~~~~~~~~ In file included from linux/scatterlist.h:8:0, from scatterlist.c:9: scatterlist.c:922:18: warning: implicit declaration of function ‘pagefault_disabled’ [-Wimplicit-function-declaration] WARN_ON_ONCE(!pagefault_disabled()); ^ linux/mm.h:23:25: note: in definition of macro ‘WARN_ON_ONCE’ int __ret_warn_on = !!(condition); \ ^~~~~~~~~ Link: https://lkml.kernel.org/r/20220118082105.1737320-1-maorg@nvidia.com Fixes: 723aca208516 ("mm/scatterlist: replace the !preemptible warning in sg_miter_stop()") Fixes: 0e84f5dbf8d6 ("scatterlist: replace flush_kernel_dcache_page with flush_dcache_page") Signed-off-by: Maor Gottlieb Tested-by: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/scatterlist/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 16ec895bbe5f..5bd9e6e80625 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -74,7 +74,7 @@ static inline unsigned long page_to_phys(struct page *page) __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ x, y) -#define preemptible() (1) +#define pagefault_disabled() (0) static inline void *kmap(struct page *page) { @@ -127,6 +127,7 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags) #define kmemleak_free(a) #define PageSlab(p) (0) +#define flush_dcache_page(p) #define MAX_ERRNO 4095 From 09c6304e38e440b93a9ebf3f3cf75cd6cb529f91 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Sat, 29 Jan 2022 13:41:11 -0800 Subject: [PATCH 0819/1295] kasan: test: fix compatibility with FORTIFY_SOURCE With CONFIG_FORTIFY_SOURCE enabled, string functions will also perform dynamic checks using __builtin_object_size(ptr), which when failed will panic the kernel. Because the KASAN test deliberately performs out-of-bounds operations, the kernel panics with FORTIFY_SOURCE, for example: | kernel BUG at lib/string_helpers.c:910! | invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI | CPU: 1 PID: 137 Comm: kunit_try_catch Tainted: G B 5.16.0-rc3+ #3 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 | RIP: 0010:fortify_panic+0x19/0x1b | ... | Call Trace: | kmalloc_oob_in_memset.cold+0x16/0x16 | ... Fix it by also hiding `ptr` from the optimizer, which will ensure that __builtin_object_size() does not return a valid size, preventing fortified string functions from panicking. Link: https://lkml.kernel.org/r/20220124160744.1244685-1-elver@google.com Signed-off-by: Marco Elver Reported-by: Nico Pache Reviewed-by: Nico Pache Reviewed-by: Andrey Konovalov Reviewed-by: Kees Cook Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 847cdbefab46..26a5c9007653 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -492,6 +492,7 @@ static void kmalloc_oob_in_memset(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + KASAN_GRANULE_SIZE)); @@ -515,6 +516,7 @@ static void kmalloc_memmove_negative_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); @@ -531,6 +533,7 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); @@ -893,6 +896,7 @@ static void kasan_memchr(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); @@ -919,6 +923,7 @@ static void kasan_memcmp(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = memcmp(ptr, arr, size+1)); From 27fe73394a1c6d0b07fa4d95f1bca116d1cc66e9 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Sat, 29 Jan 2022 13:41:14 -0800 Subject: [PATCH 0820/1295] mm, kasan: use compare-exchange operation to set KASAN page tag It has been reported that the tag setting operation on newly-allocated pages can cause the page flags to be corrupted when performed concurrently with other flag updates as a result of the use of non-atomic operations. Fix the problem by using a compare-exchange loop to update the tag. Link: https://lkml.kernel.org/r/20220120020148.1632253-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0cec63101 Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e1a84b1e6787..213cc569b192 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,11 +1506,18 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { - if (kasan_enabled()) { - tag ^= 0xff; - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) From 51e50fbd3efc6064c30ed73a5e009018b36e290a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 29 Jan 2022 13:41:17 -0800 Subject: [PATCH 0821/1295] psi: fix "no previous prototype" warnings when CONFIG_CGROUPS=n When CONFIG_CGROUPS is disabled psi code generates the following warnings: kernel/sched/psi.c:1112:21: warning: no previous prototype for 'psi_trigger_create' [-Wmissing-prototypes] 1112 | struct psi_trigger *psi_trigger_create(struct psi_group *group, | ^~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1182:6: warning: no previous prototype for 'psi_trigger_destroy' [-Wmissing-prototypes] 1182 | void psi_trigger_destroy(struct psi_trigger *t) | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1249:10: warning: no previous prototype for 'psi_trigger_poll' [-Wmissing-prototypes] 1249 | __poll_t psi_trigger_poll(void **trigger_ptr, | ^~~~~~~~~~~~~~~~ Change the declarations of these functions in the header to provide the prototypes even when they are unused. Link: https://lkml.kernel.org/r/20220119223940.787748-2-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index f8ce53bfdb2a..7f7d1d88c3bb 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -25,18 +25,17 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); - -#ifdef CONFIG_CGROUPS -int psi_cgroup_alloc(struct cgroup *cgrp); -void psi_cgroup_free(struct cgroup *cgrp); -void cgroup_move_task(struct task_struct *p, struct css_set *to); - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); #endif #else /* CONFIG_PSI */ From 44585f7bc0cb01095bc2ad4258049c02bbad21ef Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 29 Jan 2022 13:41:20 -0800 Subject: [PATCH 0822/1295] psi: fix "defined but not used" warnings when CONFIG_PROC_FS=n When CONFIG_PROC_FS is disabled psi code generates the following warnings: kernel/sched/psi.c:1364:30: warning: 'psi_cpu_proc_ops' defined but not used [-Wunused-const-variable=] 1364 | static const struct proc_ops psi_cpu_proc_ops = { | ^~~~~~~~~~~~~~~~ kernel/sched/psi.c:1355:30: warning: 'psi_memory_proc_ops' defined but not used [-Wunused-const-variable=] 1355 | static const struct proc_ops psi_memory_proc_ops = { | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1346:30: warning: 'psi_io_proc_ops' defined but not used [-Wunused-const-variable=] 1346 | static const struct proc_ops psi_io_proc_ops = { | ^~~~~~~~~~~~~~~ Make definitions of these structures and related functions conditional on CONFIG_PROC_FS config. Link: https://lkml.kernel.org/r/20220119223940.787748-3-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 79 ++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c137c4d6983e..e14358178849 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1278,6 +1240,45 @@ __poll_t psi_trigger_poll(void **trigger_ptr, return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1392,3 +1393,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ From 4cd1103d8c66b2cdb7e64385c274edb0ac5e8887 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Sat, 29 Jan 2022 13:41:23 -0800 Subject: [PATCH 0823/1295] jbd2: export jbd2_journal_[grab|put]_journal_head Patch series "ocfs2: fix a deadlock case". This fixes a deadlock case in ocfs2. We firstly export jbd2 symbols jbd2_journal_[grab|put]_journal_head as preparation and later use them in ocfs2 insread of jbd_[lock|unlock]_bh_journal_head to fix the deadlock. This patch (of 2): This exports symbols jbd2_journal_[grab|put]_journal_head, which will be used outside modules, e.g. ocfs2. Link: https://lkml.kernel.org/r/20220121071205.100648-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Andreas Dilger Cc: Gautham Ananthakrishna Cc: Saeed Mirzamohammadi Cc: "Theodore Ts'o" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f13d548e4a7f..bf108d4493d2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2972,6 +2972,7 @@ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) jbd_unlock_bh_journal_head(bh); return jh; } +EXPORT_SYMBOL(jbd2_journal_grab_journal_head); static void __journal_remove_journal_head(struct buffer_head *bh) { @@ -3024,6 +3025,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } } +EXPORT_SYMBOL(jbd2_journal_put_journal_head); /* * Initialize jbd inode head From ddf4b773aa40790dfa936bd845c18e735a49c61c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Sat, 29 Jan 2022 13:41:27 -0800 Subject: [PATCH 0824/1295] ocfs2: fix a deadlock when commit trans commit 6f1b228529ae introduces a regression which can deadlock as follows: Task1: Task2: jbd2_journal_commit_transaction ocfs2_test_bg_bit_allocatable spin_lock(&jh->b_state_lock) jbd_lock_bh_journal_head __jbd2_journal_remove_checkpoint spin_lock(&jh->b_state_lock) jbd2_journal_put_journal_head jbd_lock_bh_journal_head Task1 and Task2 lock bh->b_state and jh->b_state_lock in different order, which finally result in a deadlock. So use jbd2_journal_[grab|put]_journal_head instead in ocfs2_test_bg_bit_allocatable() to fix it. Link: https://lkml.kernel.org/r/20220121071205.100648-3-joseph.qi@linux.alibaba.com Fixes: 6f1b228529ae ("ocfs2: fix race between searching chunks and release journal_head from buffer_head") Signed-off-by: Joseph Qi Reported-by: Gautham Ananthakrishna Tested-by: Gautham Ananthakrishna Reported-by: Saeed Mirzamohammadi Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 481017e1dac5..166c8918c825 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,26 +1251,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret = 1; + int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; - if (!buffer_jbd(bg_bh)) + jh = jbd2_journal_grab_journal_head(bg_bh); + if (!jh) return 1; - jbd_lock_bh_journal_head(bg_bh); - if (buffer_jbd(bg_bh)) { - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); - } - jbd_unlock_bh_journal_head(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); return ret; } From 22e424feb6658c5d6789e45121830357809c59cb Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Sat, 29 Jan 2022 18:42:59 +0900 Subject: [PATCH 0825/1295] Revert "fs/9p: search open fids first" This reverts commit 478ba09edc1f2f2ee27180a06150cb2d1a686f9c. That commit was meant as a fix for setattrs with by fd (e.g. ftruncate) to use an open fid instead of the first fid it found on lookup. The proper fix for that is to use the fid associated with the open file struct, available in iattr->ia_file for such operations, and was actually done just before in 66246641609b ("9p: retrieve fid from file when file instance exist.") As such, this commit is no longer required. Furthermore, changing lookup to return open fids first had unwanted side effects, as it turns out the protocol forbids the use of open fids for further walks (e.g. clone_fid) and we broke mounts for some servers enforcing this rule. Note this only reverts to the old working behaviour, but it's still possible for lookup to return open fids if dentry->d_fsdata is not set, so more work is needed to make sure we respect this rule in the future, for example by adding a flag to the lookup functions to only match certain fid open modes depending on caller requirements. Link: https://lkml.kernel.org/r/20220130130651.712293-1-asmadeus@codewreck.org Fixes: 478ba09edc1f ("fs/9p: search open fids first") Cc: stable@vger.kernel.org # v5.11+ Reported-by: ron minnich Reported-by: ng@0x80.stream Signed-off-by: Dominique Martinet --- fs/9p/fid.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6aab046c98e2..79df61fe0e59 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) dentry, dentry, from_kuid(&init_user_ns, uid), any); ret = NULL; - - if (d_inode(dentry)) - ret = v9fs_fid_find_inode(d_inode(dentry), uid); - /* we'll recheck under lock if there's anything to look in */ - if (!ret && dentry->d_fsdata) { + if (dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; spin_lock(&dentry->d_lock); @@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); + } else { + if (dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, uid); } return ret; From 26291c54e111ff6ba87a164d85d4a4e134b7315c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 30 Jan 2022 15:37:07 +0200 Subject: [PATCH 0826/1295] Linux 5.17-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0fb4f94a6885..1fc3491096cb 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Gobble Gobble # *DOCUMENTATION* From 2719c7160dcfaae1f73a1c0c210ad3281c19022e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 0827/1295] vfs: make freeze_super abort when sync_filesystem returns error If we fail to synchronize the filesystem while preparing to freeze the fs, abort the freeze. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/super.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/super.c b/fs/super.c index 7af820ba5ad5..f1d4a193602d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1616,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1691,7 +1689,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1703,7 +1708,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1748,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); From 5679897eb104cec9e99609c3f045a0c20603da4c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 0828/1295] vfs: make sync_filesystem return errors from ->sync_fs Strangely, sync_filesystem ignores the return code from the ->sync_fs call, which means that syscalls like syncfs(2) never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/sync.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 3ce8e2137f31..c7690016453e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -29,7 +29,7 @@ */ int sync_filesystem(struct super_block *sb) { - int ret; + int ret = 0; /* * We need to be protected against the filesystem going from @@ -52,15 +52,21 @@ int sync_filesystem(struct super_block *sb) * at a time. */ writeback_inodes_sb(sb, WB_REASON_SYNC); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 0); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 0); + if (ret) + return ret; + } ret = sync_blockdev_nowait(sb->s_bdev); - if (ret < 0) + if (ret) return ret; sync_inodes_sb(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); From dd5532a4994bfda0386eb2286ec00758cee08444 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:16 -0800 Subject: [PATCH 0829/1295] quota: make dquot_quota_sync return errors from ->sync_fs Strangely, dquot_quota_sync ignores the return code from the ->sync_fs call, which means that quotacalls like Q_SYNC never see the error. This doesn't seem right, so fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/quota/dquot.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 22d904bde6ab..a74aef99bd3d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so From 2d86293c70750e4331e9616aded33ab6b47c299d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 30 Jan 2022 08:53:17 -0800 Subject: [PATCH 0830/1295] xfs: return errors in xfs_fs_sync_fs Now that the VFS will do something with the return values from ->sync_fs, make ours pass on error codes. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner --- fs/xfs/xfs_super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e8f37bdc8354..4c0dee78b2f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -735,6 +735,7 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; trace_xfs_fs_sync_fs(mp, __return_address); @@ -744,7 +745,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. From 47307c31d90ae7d52cebbbc7c1d4ff213213d4e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jan 2022 12:38:11 -0800 Subject: [PATCH 0831/1295] crypto: octeontx2 - Avoid stack variable overflow Building with -Warray-bounds showed a stack variable array index overflow. Increase the expected size of the array to avoid the warning: In file included from ./include/linux/printk.h:555, from ./include/asm-generic/bug.h:22, from ./arch/x86/include/asm/bug.h:84, from ./include/linux/bug.h:5, from ./include/linux/mmdebug.h:5, from ./include/linux/gfp.h:5, from ./include/linux/firmware.h:7, from drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c:5: drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c: In function 'otx2_cpt_print_uc_dbg_info': ./include/linux/dynamic_debug.h:162:33: warning: array subscript 4 is above array bounds of 'u32[4]' {aka 'unsigned int[4]'} [-Warray-bounds] 162 | _dynamic_func_call(fmt, __dynamic_pr_debug, \ | ^ ./include/linux/dynamic_debug.h:134:17: note: in definition of macro '__dynamic_func_call' 134 | func(&id, ##__VA_ARGS__); \ | ^~~~ ./include/linux/dynamic_debug.h:162:9: note: in expansion of macro '_dynamic_func_call' 162 | _dynamic_func_call(fmt, __dynamic_pr_debug, \ | ^~~~~~~~~~~~~~~~~~ ./include/linux/printk.h:570:9: note: in expansion of macro 'dynamic_pr_debug' 570 | dynamic_pr_debug(fmt, ##__VA_ARGS__) | ^~~~~~~~~~~~~~~~ drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c:1807:41: note: in expansion of macro 'pr_debug' 1807 | pr_debug("Mask: %8.8x %8.8x %8.8x %8.8x %8.8x", | ^~~~~~~~ drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c:1765:13: note: while referencing 'mask' 1765 | u32 mask[4]; | ^~~~ This is justified because the mask size (eng_grps->engs_num) can be at most 144 (OTX2_CPT_MAX_ENGINES bits), which is larger than available storage. 4 * 32 == 128, so this must be 5: 5 * 32bit = 160. Additionally clear the mask before conversion so trailing bits are zero. Cc: Herbert Xu Cc: Boris Brezillon Cc: Arnaud Ebalard Cc: Srujana Challa Cc: "David S. Miller" Cc: Suheil Chandran Cc: Shijith Thotton Cc: Lukasz Bartosik Cc: linux-crypto@vger.kernel.org Fixes: d9d7749773e8 ("crypto: octeontx2 - add apis for custom engine groups") Acked-by: Ard Biesheuvel Signed-off-by: Kees Cook Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c b/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c index 4c8ebdf671ca..1b4d425bbf0e 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c @@ -1753,7 +1753,6 @@ void otx2_cpt_print_uc_dbg_info(struct otx2_cptpf_dev *cptpf) char engs_info[2 * OTX2_CPT_NAME_LENGTH]; struct otx2_cpt_eng_grp_info *grp; struct otx2_cpt_engs_rsvd *engs; - u32 mask[4]; int i, j; pr_debug("Engine groups global info"); @@ -1785,6 +1784,8 @@ void otx2_cpt_print_uc_dbg_info(struct otx2_cptpf_dev *cptpf) for (j = 0; j < OTX2_CPT_MAX_ETYPES_PER_GRP; j++) { engs = &grp->engs[j]; if (engs->type) { + u32 mask[5] = { }; + get_engs_info(grp, engs_info, 2 * OTX2_CPT_NAME_LENGTH, j); pr_debug("Slot%d: %s", j, engs_info); From b837a9f5ab3bdfab9233c9f98a6bef717673a3e5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 31 Jan 2022 08:57:38 +0100 Subject: [PATCH 0832/1295] ALSA: hda: realtek: Fix race at concurrent COEF updates The COEF access is done with two steps: setting the index then read or write the data. When multiple COEF accesses are performed concurrently, the index and data might be paired unexpectedly. In most cases, this isn't a big problem as the COEF setup is done at the initialization, but some dynamic changes like the mute LED may hit such a race. For avoiding the racy COEF accesses, this patch introduces a new mutex coef_mutex to alc_spec, and wrap the COEF accessing functions with it. Reported-by: Alexander Sergeyev Cc: Link: https://lore.kernel.org/r/20220111195229.a77wrpjclqwrx4bx@localhost.localdomain Link: https://lore.kernel.org/r/20220131075738.24323-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 61 ++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 668274e52674..a5677be0a405 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -98,6 +98,7 @@ struct alc_spec { unsigned int gpio_mic_led_mask; struct alc_coef_led mute_led_coef; struct alc_coef_led mic_led_coef; + struct mutex coef_mutex; hda_nid_t headset_mic_pin; hda_nid_t headphone_mic_pin; @@ -137,8 +138,8 @@ struct alc_spec { * COEF access helper functions */ -static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, - unsigned int coef_idx) +static int __alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx) { unsigned int val; @@ -147,28 +148,61 @@ static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, return val; } +static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx) +{ + struct alc_spec *spec = codec->spec; + unsigned int val; + + mutex_lock(&spec->coef_mutex); + val = __alc_read_coefex_idx(codec, nid, coef_idx); + mutex_unlock(&spec->coef_mutex); + return val; +} + #define alc_read_coef_idx(codec, coef_idx) \ alc_read_coefex_idx(codec, 0x20, coef_idx) -static void alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, - unsigned int coef_idx, unsigned int coef_val) +static void __alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int coef_val) { snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_COEF_INDEX, coef_idx); snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PROC_COEF, coef_val); } +static void alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int coef_val) +{ + struct alc_spec *spec = codec->spec; + + mutex_lock(&spec->coef_mutex); + __alc_write_coefex_idx(codec, nid, coef_idx, coef_val); + mutex_unlock(&spec->coef_mutex); +} + #define alc_write_coef_idx(codec, coef_idx, coef_val) \ alc_write_coefex_idx(codec, 0x20, coef_idx, coef_val) +static void __alc_update_coefex_idx(struct hda_codec *codec, hda_nid_t nid, + unsigned int coef_idx, unsigned int mask, + unsigned int bits_set) +{ + unsigned int val = __alc_read_coefex_idx(codec, nid, coef_idx); + + if (val != -1) + __alc_write_coefex_idx(codec, nid, coef_idx, + (val & ~mask) | bits_set); +} + static void alc_update_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx, unsigned int mask, unsigned int bits_set) { - unsigned int val = alc_read_coefex_idx(codec, nid, coef_idx); + struct alc_spec *spec = codec->spec; - if (val != -1) - alc_write_coefex_idx(codec, nid, coef_idx, - (val & ~mask) | bits_set); + mutex_lock(&spec->coef_mutex); + __alc_update_coefex_idx(codec, nid, coef_idx, mask, bits_set); + mutex_unlock(&spec->coef_mutex); } #define alc_update_coef_idx(codec, coef_idx, mask, bits_set) \ @@ -201,13 +235,17 @@ struct coef_fw { static void alc_process_coef_fw(struct hda_codec *codec, const struct coef_fw *fw) { + struct alc_spec *spec = codec->spec; + + mutex_lock(&spec->coef_mutex); for (; fw->nid; fw++) { if (fw->mask == (unsigned short)-1) - alc_write_coefex_idx(codec, fw->nid, fw->idx, fw->val); + __alc_write_coefex_idx(codec, fw->nid, fw->idx, fw->val); else - alc_update_coefex_idx(codec, fw->nid, fw->idx, - fw->mask, fw->val); + __alc_update_coefex_idx(codec, fw->nid, fw->idx, + fw->mask, fw->val); } + mutex_unlock(&spec->coef_mutex); } /* @@ -1153,6 +1191,7 @@ static int alc_alloc_spec(struct hda_codec *codec, hda_nid_t mixer_nid) codec->spdif_status_reset = 1; codec->forced_resume = 1; codec->patch_ops = alc_patch_ops; + mutex_init(&spec->coef_mutex); err = alc_codec_rename_from_preset(codec); if (err < 0) { From 63394a16086fc2152869d7902621e2525e14bc40 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:41 +0100 Subject: [PATCH 0833/1295] ALSA: hda/realtek: Add missing fixup-model entry for Gigabyte X570 ALC1220 quirks The initial commit of the new Gigabyte X570 ALC1220 quirks lacked the fixup-model entry in alc882_fixup_models[]. It seemed not to cause any ill effects but for completeness sake this commit makes up for that. Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-2-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a5677be0a405..d662ad805960 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2665,6 +2665,7 @@ static const struct hda_model_fixup alc882_fixup_models[] = { {.id = ALC882_FIXUP_NO_PRIMARY_HP, .name = "no-primary-hp"}, {.id = ALC887_FIXUP_ASUS_BASS, .name = "asus-bass"}, {.id = ALC1220_FIXUP_GB_DUAL_CODECS, .name = "dual-codecs"}, + {.id = ALC1220_FIXUP_GB_X570, .name = "gb-x570"}, {.id = ALC1220_FIXUP_CLEVO_P950, .name = "clevo-p950"}, {} }; From 41a8601302ecbe704ac970552c33dc942300fc37 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:42 +0100 Subject: [PATCH 0834/1295] ALSA: hda/realtek: Fix silent output on Gigabyte X570S Aorus Master (newer chipset) Newer versions of the X570 Master come with a newer revision of the mainboard chipset - the X570S. These boards have the same ALC1220 codec but seem to initialize the codec with a different parameter in Coef 0x7 which causes the output audio to be very low. We therefore write a known-good value to Coef 0x7 to fix that. As the value is the exact same as on the other X570(non-S) boards the same quirk-function can be shared between both generations. This commit adds the Gigabyte X570S Aorus Master to the list of boards using the ALC1220_FIXUP_GB_X570 quirk. This fixes both, the silent output and the no-audio after reboot from windows problems. This work has been tested by the folks over at the level1techs forum here: https://forum.level1techs.com/t/has-anybody-gotten-audio-working-in-linux-on-aorus-x570-master/154072 Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-3-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d662ad805960..54301d208d2b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2164,6 +2164,7 @@ static void alc1220_fixup_gb_x570(struct hda_codec *codec, { static const hda_nid_t conn1[] = { 0x0c }; static const struct coef_fw gb_x570_coefs[] = { + WRITE_COEF(0x07, 0x03c0), WRITE_COEF(0x1a, 0x01c1), WRITE_COEF(0x1b, 0x0202), WRITE_COEF(0x43, 0x3005), @@ -2591,6 +2592,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x1458, 0xa0cd, "Gigabyte X570 Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1458, 0xa0d5, "Gigabyte X570S Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1462, 0x11f7, "MSI-GE63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1229, "MSI-GP73", ALC1220_FIXUP_CLEVO_P950), From ea3541961376f733373839cc90493aafa8a7f733 Mon Sep 17 00:00:00 2001 From: Christian Lachner Date: Sat, 29 Jan 2022 12:32:43 +0100 Subject: [PATCH 0835/1295] ALSA: hda/realtek: Fix silent output on Gigabyte X570 Aorus Xtreme after reboot from Windows This commit switches the Gigabyte X570 Aorus Xtreme from using the ALC1220_FIXUP_CLEVO_P950 to the ALC1220_FIXUP_GB_X570 quirk. This fixes the no-audio after reboot from windows problem. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205275 Signed-off-by: Christian Lachner Cc: Link: https://lore.kernel.org/r/20220129113243.93068-4-gladiac@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 54301d208d2b..0d52eca57bbc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2591,7 +2591,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x1458, 0xa0cd, "Gigabyte X570 Aorus Master", ALC1220_FIXUP_GB_X570), - SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1458, 0xa0d5, "Gigabyte X570S Aorus Master", ALC1220_FIXUP_GB_X570), SND_PCI_QUIRK(0x1462, 0x11f7, "MSI-GE63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), From 94db9cc8f8fa2d5426ce79ec4ca16028f7084224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20Geant=C4=83?= Date: Mon, 31 Jan 2022 03:05:23 +0200 Subject: [PATCH 0836/1295] ALSA: hda/realtek: Add quirk for ASUS GU603 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ASUS GU603 (Zephyrus M16 - SSID 1043:16b2) requires a quirk similar to other ASUS devices for correctly routing the 4 integrated speakers. This fixes it by adding a corresponding quirk entry, which connects the bass speakers to the proper DAC. Signed-off-by: Albert Geantă Cc: Link: https://lore.kernel.org/r/20220131010523.546386-1-albertgeanta@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 0d52eca57bbc..8315bf7d4c38 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9011,6 +9011,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1f11, "ASUS Zephyrus G14", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x16b2, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), From b470947c3672f7eb7c4c271d510383d896831cc2 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 27 Jan 2022 16:15:00 -0600 Subject: [PATCH 0837/1295] usb: dwc3: xilinx: fix uninitialized return value A previous patch to skip part of the initialization when a USB3 PHY was not present could result in the return value being uninitialized in that case, causing spurious probe failures. Initialize ret to 0 to avoid this. Fixes: 9678f3361afc ("usb: dwc3: xilinx: Skip resets and USB3 register settings for USB2.0 mode") Cc: Reviewed-by: Nathan Chancellor Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220127221500.177021-1-robert.hancock@calian.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index e14ac15e24c3..a6f3a9b38789 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -99,7 +99,7 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) struct device *dev = priv_data->dev; struct reset_control *crst, *hibrst, *apbrst; struct phy *usb3_phy; - int ret; + int ret = 0; u32 reg; usb3_phy = devm_phy_optional_get(dev, "usb3-phy"); From 8172f41859cf7516e73eb957297e6752b3073119 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 20 Jan 2022 20:31:16 -0800 Subject: [PATCH 0838/1295] drm/i915: Allocate intel_engine_coredump_alloc with ALLOW_FAIL Allocate intel_engine_coredump_alloc with ALLOW_FAIL rather than GFP_KERNEL to fully decouple the error capture from fence signalling. v2: (John Harrison) - Fix typo in commit message (s/do/to) Fixes: 8b91cdd4f8649 ("drm/i915: Use __GFP_KSWAPD_RECLAIM in the capture code") Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220121043118.24886-2-matthew.brost@intel.com (cherry picked from commit 4f72fc3c7f3d9f29a438bb0e17c7773f2fc8242a) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5ae812d60abe..0633888a411e 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1522,7 +1522,7 @@ capture_engine(struct intel_engine_cs *engine, struct i915_request *rq = NULL; unsigned long flags; - ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); + ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL); if (!ee) return NULL; From 5ae13c305ef8cb54efc4f0ba4565709b9f320fed Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 11 Jan 2022 08:39:29 -0800 Subject: [PATCH 0839/1295] drm/i915: Lock timeline mutex directly in error path of eb_pin_timeline Don't use the interruptable version of the timeline mutex lock in the error path of eb_pin_timeline as the cleanup must always happen. v2: (John Harrison) - Don't check for interrupt during mutex lock v3: (Tvrtko) - A comment explaining why lock helper isn't used Fixes: 544460c33821 ("drm/i915: Multi-BB execbuf") Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220111163929.14017-1-matthew.brost@intel.com (cherry picked from commit cb935c4618bd2ff9058feee4af7088446da6a763) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 3a5b247be738..1736efa43339 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2505,9 +2505,14 @@ static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, timeout) < 0) { i915_request_put(rq); - tl = intel_context_timeline_lock(ce); + /* + * Error path, cannot use intel_context_timeline_lock as + * that is user interruptable and this clean up step + * must be done. + */ + mutex_lock(&ce->timeline->mutex); intel_context_exit(ce); - intel_context_timeline_unlock(tl); + mutex_unlock(&ce->timeline->mutex); if (nonblock) return -EWOULDBLOCK; From 90a3d22ff02b196d5884e111f39271a1d4ee8e3e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Jan 2022 15:24:09 +0300 Subject: [PATCH 0840/1295] drm/i915/overlay: Prevent divide by zero bugs in scaling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch detected a divide by zero bug in check_overlay_scaling(). drivers/gpu/drm/i915/display/intel_overlay.c:976 check_overlay_scaling() error: potential divide by zero bug '/ rec->dst_height'. drivers/gpu/drm/i915/display/intel_overlay.c:980 check_overlay_scaling() error: potential divide by zero bug '/ rec->dst_width'. Prevent this by ensuring that the dst height and width are non-zero. Fixes: 02e792fbaadb ("drm/i915: implement drmmode overlay support v4") Signed-off-by: Dan Carpenter Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220124122409.GA31673@kili (cherry picked from commit cf5b64f7f10b28bebb9b7c9d25e7aee5cbe43918) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_overlay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index 1a376e9a1ff3..d610e48cab94 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -959,6 +959,9 @@ static int check_overlay_dst(struct intel_overlay *overlay, const struct intel_crtc_state *pipe_config = overlay->crtc->config; + if (rec->dst_height == 0 || rec->dst_width == 0) + return -EINVAL; + if (rec->dst_x < pipe_config->pipe_src_w && rec->dst_x + rec->dst_width <= pipe_config->pipe_src_w && rec->dst_y < pipe_config->pipe_src_h && From b3f74938d65665f892d1b7807c51140f68dc911c Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Mon, 10 Jan 2022 17:55:23 -0800 Subject: [PATCH 0841/1295] drm/i915/pmu: Use PM timestamp instead of RING TIMESTAMP for reference All timestamps returned by GuC for GuC PMU busyness are captured from GUC PM TIMESTAMP. Since this timestamp does not tick when GuC goes idle, kmd uses RING_TIMESTAMP to measure busyness of an engine with an active context. In further stress testing, the MMIO read of the RING_TIMESTAMP is seen to cause a rare hang. Resolve the issue by using gt specific timestamp from PM which is in sync with the GuC PM timestamp. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Alan Previn Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220111015523.225562-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 721fd84ea1fe957453587efad5fdc44dfba58e04) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc.h | 5 ++ .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 56 ++++++++++++++----- drivers/gpu/drm/i915/i915_reg.h | 3 +- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index f9240d4baa69..3aabe164c329 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -206,6 +206,11 @@ struct intel_guc { * context usage for overflows. */ struct delayed_work work; + + /** + * @shift: Right shift value for the gpm timestamp + */ + u32 shift; } timestamp; #ifdef CONFIG_DRM_I915_SELFTEST diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index e7517206af82..fcec2cb833af 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1149,23 +1149,51 @@ static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) } } -static void guc_update_pm_timestamp(struct intel_guc *guc, - struct intel_engine_cs *engine, - ktime_t *now) +static u32 gpm_timestamp_shift(struct intel_gt *gt) { - u32 gt_stamp_now, gt_stamp_hi; + intel_wakeref_t wakeref; + u32 reg, shift; + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + reg = intel_uncore_read(gt->uncore, RPM_CONFIG0); + + shift = (reg & GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK) >> + GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_SHIFT; + + return 3 - shift; +} + +static u64 gpm_timestamp(struct intel_gt *gt) +{ + u32 lo, hi, old_hi, loop = 0; + + hi = intel_uncore_read(gt->uncore, MISC_STATUS1); + do { + lo = intel_uncore_read(gt->uncore, MISC_STATUS0); + old_hi = hi; + hi = intel_uncore_read(gt->uncore, MISC_STATUS1); + } while (old_hi != hi && loop++ < 2); + + return ((u64)hi << 32) | lo; +} + +static void guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) +{ + struct intel_gt *gt = guc_to_gt(guc); + u32 gt_stamp_lo, gt_stamp_hi; + u64 gpm_ts; lockdep_assert_held(&guc->timestamp.lock); gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp); - gt_stamp_now = intel_uncore_read(engine->uncore, - RING_TIMESTAMP(engine->mmio_base)); + gpm_ts = gpm_timestamp(gt) >> guc->timestamp.shift; + gt_stamp_lo = lower_32_bits(gpm_ts); *now = ktime_get(); - if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp)) + if (gt_stamp_lo < lower_32_bits(guc->timestamp.gt_stamp)) gt_stamp_hi++; - guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now; + guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } /* @@ -1209,7 +1237,7 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) stats_saved = *stats; gt_stamp_saved = guc->timestamp.gt_stamp; guc_update_engine_gt_clks(engine); - guc_update_pm_timestamp(guc, engine, now); + guc_update_pm_timestamp(guc, now); intel_gt_pm_put_async(gt); if (i915_reset_count(gpu_error) != reset_count) { *stats = stats_saved; @@ -1241,8 +1269,8 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) spin_lock_irqsave(&guc->timestamp.lock, flags); + guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { - guc_update_pm_timestamp(guc, engine, &unused); guc_update_engine_gt_clks(engine); engine->stats.guc.prev_total = 0; } @@ -1259,10 +1287,11 @@ static void __update_guc_busyness_stats(struct intel_guc *guc) ktime_t unused; spin_lock_irqsave(&guc->timestamp.lock, flags); - for_each_engine(engine, gt, id) { - guc_update_pm_timestamp(guc, engine, &unused); + + guc_update_pm_timestamp(guc, &unused); + for_each_engine(engine, gt, id) guc_update_engine_gt_clks(engine); - } + spin_unlock_irqrestore(&guc->timestamp.lock, flags); } @@ -1783,6 +1812,7 @@ int intel_guc_submission_init(struct intel_guc *guc) spin_lock_init(&guc->timestamp.lock); INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping); guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ; + guc->timestamp.shift = gpm_timestamp_shift(gt); return 0; } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index c32420cb8ed5..c2bb33febb68 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2684,7 +2684,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define RING_WAIT (1 << 11) /* gen3+, PRBx_CTL */ #define RING_WAIT_SEMAPHORE (1 << 10) /* gen6+ */ -#define GUCPMTIMESTAMP _MMIO(0xC3E8) +#define MISC_STATUS0 _MMIO(0xA500) +#define MISC_STATUS1 _MMIO(0xA504) /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */ #define GEN8_RING_CS_GPR(base, n) _MMIO((base) + 0x600 + (n) * 8) From 3c6f13ad723e7206f03bb2752b01d18202b7fc9d Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 26 Jan 2022 12:43:56 +0200 Subject: [PATCH 0842/1295] drm/i915/adlp: Fix TypeC PHY-ready status readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TCSS_DDI_STATUS register is indexed by tc_port not by the FIA port index, fix this up. This only caused an issue on TC#3/4 ports in legacy mode, as in all other cases the two indices either match (on TC#1/2) or the TCSS_DDI_STATUS_READY flag is set regardless of something being connected or not (on TC#1/2/3/4 in dp-alt and tbt-alt modes). Reported-and-tested-by: Chia-Lin Kao (AceLan) Fixes: 55ce306c2aa1 ("drm/i915/adl_p: Implement TC sequences") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4698 Cc: José Roberto de Souza Cc: # v5.14+ Signed-off-by: Imre Deak Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220126104356.2022975-1-imre.deak@intel.com (cherry picked from commit 516b33460c5bee78b2055637b0547bdb0e6af754) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_tc.c b/drivers/gpu/drm/i915/display/intel_tc.c index 40faa18947c9..dbd7d0d83a14 100644 --- a/drivers/gpu/drm/i915/display/intel_tc.c +++ b/drivers/gpu/drm/i915/display/intel_tc.c @@ -345,10 +345,11 @@ static bool icl_tc_phy_status_complete(struct intel_digital_port *dig_port) static bool adl_tc_phy_status_complete(struct intel_digital_port *dig_port) { struct drm_i915_private *i915 = to_i915(dig_port->base.base.dev); + enum tc_port tc_port = intel_port_to_tc(i915, dig_port->base.port); struct intel_uncore *uncore = &i915->uncore; u32 val; - val = intel_uncore_read(uncore, TCSS_DDI_STATUS(dig_port->tc_phy_fia_idx)); + val = intel_uncore_read(uncore, TCSS_DDI_STATUS(tc_port)); if (val == 0xffffffff) { drm_dbg_kms(&i915->drm, "Port %s: PHY in TCCOLD, assuming not complete\n", From ebe2b1add1055b903e2acd86b290a85297edc0b3 Mon Sep 17 00:00:00 2001 From: Udipto Goswami Date: Thu, 27 Jan 2022 09:39:55 +0530 Subject: [PATCH 0843/1295] usb: f_fs: Fix use-after-free for epfile Consider a case where ffs_func_eps_disable is called from ffs_func_disable as part of composition switch and at the same time ffs_epfile_release get called from userspace. ffs_epfile_release will free up the read buffer and call ffs_data_closed which in turn destroys ffs->epfiles and mark it as NULL. While this was happening the driver has already initialized the local epfile in ffs_func_eps_disable which is now freed and waiting to acquire the spinlock. Once spinlock is acquired the driver proceeds with the stale value of epfile and tries to free the already freed read buffer causing use-after-free. Following is the illustration of the race: CPU1 CPU2 ffs_func_eps_disable epfiles (local copy) ffs_epfile_release ffs_data_closed if (last file closed) ffs_data_reset ffs_data_clear ffs_epfiles_destroy spin_lock dereference epfiles Fix this races by taking epfiles local copy & assigning it under spinlock and if epfiles(local) is null then update it in ffs->epfiles then finally destroy it. Extending the scope further from the race, protecting the ep related structures, and concurrent accesses. Fixes: a9e6f83c2df1 ("usb: gadget: f_fs: stop sleeping in ffs_func_eps_disable") Co-developed-by: Udipto Goswami Reviewed-by: John Keeping Signed-off-by: Pratham Pratap Signed-off-by: Udipto Goswami Link: https://lore.kernel.org/r/1643256595-10797-1-git-send-email-quic_ugoswami@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 56 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 25ad1e97a458..1922fd02043c 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1711,16 +1711,24 @@ static void ffs_data_put(struct ffs_data *ffs) static void ffs_data_closed(struct ffs_data *ffs) { + struct ffs_epfile *epfiles; + unsigned long flags; + ENTER(); if (atomic_dec_and_test(&ffs->opened)) { if (ffs->no_disconnect) { ffs->state = FFS_DEACTIVATED; - if (ffs->epfiles) { - ffs_epfiles_destroy(ffs->epfiles, - ffs->eps_count); - ffs->epfiles = NULL; - } + spin_lock_irqsave(&ffs->eps_lock, flags); + epfiles = ffs->epfiles; + ffs->epfiles = NULL; + spin_unlock_irqrestore(&ffs->eps_lock, + flags); + + if (epfiles) + ffs_epfiles_destroy(epfiles, + ffs->eps_count); + if (ffs->setup_state == FFS_SETUP_PENDING) __ffs_ep0_stall(ffs); } else { @@ -1767,14 +1775,27 @@ static struct ffs_data *ffs_data_new(const char *dev_name) static void ffs_data_clear(struct ffs_data *ffs) { + struct ffs_epfile *epfiles; + unsigned long flags; + ENTER(); ffs_closed(ffs); BUG_ON(ffs->gadget); - if (ffs->epfiles) { - ffs_epfiles_destroy(ffs->epfiles, ffs->eps_count); + spin_lock_irqsave(&ffs->eps_lock, flags); + epfiles = ffs->epfiles; + ffs->epfiles = NULL; + spin_unlock_irqrestore(&ffs->eps_lock, flags); + + /* + * potential race possible between ffs_func_eps_disable + * & ffs_epfile_release therefore maintaining a local + * copy of epfile will save us from use-after-free. + */ + if (epfiles) { + ffs_epfiles_destroy(epfiles, ffs->eps_count); ffs->epfiles = NULL; } @@ -1922,12 +1943,15 @@ static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count) static void ffs_func_eps_disable(struct ffs_function *func) { - struct ffs_ep *ep = func->eps; - struct ffs_epfile *epfile = func->ffs->epfiles; - unsigned count = func->ffs->eps_count; + struct ffs_ep *ep; + struct ffs_epfile *epfile; + unsigned short count; unsigned long flags; spin_lock_irqsave(&func->ffs->eps_lock, flags); + count = func->ffs->eps_count; + epfile = func->ffs->epfiles; + ep = func->eps; while (count--) { /* pending requests get nuked */ if (ep->ep) @@ -1945,14 +1969,18 @@ static void ffs_func_eps_disable(struct ffs_function *func) static int ffs_func_eps_enable(struct ffs_function *func) { - struct ffs_data *ffs = func->ffs; - struct ffs_ep *ep = func->eps; - struct ffs_epfile *epfile = ffs->epfiles; - unsigned count = ffs->eps_count; + struct ffs_data *ffs; + struct ffs_ep *ep; + struct ffs_epfile *epfile; + unsigned short count; unsigned long flags; int ret = 0; spin_lock_irqsave(&func->ffs->eps_lock, flags); + ffs = func->ffs; + ep = func->eps; + epfile = ffs->epfiles; + count = ffs->eps_count; while(count--) { ep->ep->driver_data = ep; From fa77ce201f7f2d823b07753575122d1ae5597fbe Mon Sep 17 00:00:00 2001 From: Stephan Brunner Date: Sat, 8 Jan 2022 13:00:20 +0100 Subject: [PATCH 0844/1295] USB: serial: ch341: add support for GW Instek USB2.0-Serial devices Programmable lab power supplies made by GW Instek, such as the GPP-2323, have a USB port exposing a serial port to control the device. Stringing the supplied Windows driver, references to the ch341 chip are found. Binding the existing ch341 driver to the VID/PID of the GPP-2323 ("GW Instek USB2.0-Serial" as per the USB product name) works out of the box, communication and control is now possible. This patch should work with any GPP series power supply due to similarities in the product line. Signed-off-by: Stephan Brunner Link: https://lore.kernel.org/r/4a47b864-0816-6f6a-efee-aa20e74bcdc6@stephan-brunner.net Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 29f4b87a9e74..58cba8ee0277 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -85,6 +85,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1a86, 0x5523) }, { USB_DEVICE(0x1a86, 0x7522) }, { USB_DEVICE(0x1a86, 0x7523) }, + { USB_DEVICE(0x2184, 0x0057) }, { USB_DEVICE(0x4348, 0x5523) }, { USB_DEVICE(0x9986, 0x7523) }, { }, From d48384c7ed6c8fe4727eaa0f3048f62afd1cd715 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Tue, 11 Jan 2022 23:12:05 +0100 Subject: [PATCH 0845/1295] USB: serial: option: add ZTE MF286D modem Modem from ZTE MF286D is an Qualcomm MDM9250 based 3G/4G modem. T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=5000 MxCh= 0 D: Ver= 3.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=19d2 ProdID=1485 Rev=52.87 S: Manufacturer=ZTE,Incorporated S: Product=ZTE Technologies MSM S: SerialNumber=MF286DZTED000000 C:* #Ifs= 7 Cfg#= 1 Atr=80 MxPwr=896mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=ff Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Pawel Dembicki Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 42420bfc983c..962e9943fc20 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1649,6 +1649,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(ZTE_VENDOR_ID, 0x1476, 0xff) }, /* GosunCn ZTE WeLink ME3630 (ECM/NCM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1481, 0xff, 0x00, 0x00) }, /* ZTE MF871A */ + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1485, 0xff, 0xff, 0xff), /* ZTE MF286D */ + .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1533, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1534, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1535, 0xff, 0xff, 0xff) }, From 341adeec9adad0874f29a0a1af35638207352a39 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 26 Jan 2022 23:33:04 +0800 Subject: [PATCH 0846/1295] net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 133 +++++++++++++++++++++++++++++++++++++++++------ net/smc/smc.h | 20 ++++++- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d5ea62b82bb8..8c89d0b0ca18 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,17 +566,115 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait; - unsigned long flags; + struct sock *clcsk; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } + clcsk = smc->clcsock->sk; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -587,16 +685,22 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. */ - clc_wait = sk_sleep(smc->clcsock->sk); - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_fback_state_change; + clcsk->sk_data_ready = smc_fback_data_ready; + clcsk->sk_write_space = smc_fback_write_space; + clcsk->sk_error_report = smc_fback_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } mutex_unlock(&smc->clcsock_release_lock); return 0; @@ -2115,10 +2219,9 @@ out: static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc; + struct smc_sock *lsmc = + smc_clcsock_user_data(listen_clcsock); - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); if (!lsmc) return; lsmc->clcsk_data_ready(listen_clcsock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 3d0b8e300deb..37b2001a0255 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -139,6 +139,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -228,8 +234,14 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -264,6 +276,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ From baf927a833ca2c6717795ac131079f485cb7a5dc Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 31 Jan 2022 09:52:01 +0100 Subject: [PATCH 0847/1295] pinctrl: microchip-sgpio: Fix support for regmap Initially the driver accessed the registers using u32 __iomem but then in the blamed commit it changed it to use regmap. The problem is that now the offset of the registers is not calculated anymore at word offset but at byte offset. Therefore make sure to multiply the offset with word size. Acked-by: Steen Hegelund Reviewed-by: Colin Foster Fixes: 2afbbab45c261a ("pinctrl: microchip-sgpio: update to support regmap") Signed-off-by: Horatiu Vultur Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220131085201.307031-1-horatiu.vultur@microchip.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-microchip-sgpio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 8e081c90bdb2..639f1130e989 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -137,7 +137,8 @@ static inline int sgpio_addr_to_pin(struct sgpio_priv *priv, int port, int bit) static inline u32 sgpio_get_addr(struct sgpio_priv *priv, u32 rno, u32 off) { - return priv->properties->regoff[rno] + off; + return (priv->properties->regoff[rno] + off) * + regmap_get_reg_stride(priv->regs); } static u32 sgpio_readl(struct sgpio_priv *priv, u32 rno, u32 off) From 092f45b13e51666fe8ecbf2d6cd247aa7e6c1f74 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 27 Jan 2022 14:00:02 -0500 Subject: [PATCH 0848/1295] usb: ulpi: Move of_node_put to ulpi_dev_release Drivers are not unbound from the device when ulpi_unregister_interface is called. Move of_node-freeing code to ulpi_dev_release which is called only after all users are gone. Fixes: ef6a7bcfb01c ("usb: ulpi: Support device discovery via DT") Cc: stable Reviewed-by: Heikki Krogerus Signed-off-by: Sean Anderson Link: https://lore.kernel.org/r/20220127190004.1446909-2-sean.anderson@seco.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/ulpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index 8f8405b0d608..09ad569a1a35 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -130,6 +130,7 @@ static const struct attribute_group *ulpi_dev_attr_groups[] = { static void ulpi_dev_release(struct device *dev) { + of_node_put(dev->of_node); kfree(to_ulpi_dev(dev)); } @@ -299,7 +300,6 @@ EXPORT_SYMBOL_GPL(ulpi_register_interface); */ void ulpi_unregister_interface(struct ulpi *ulpi) { - of_node_put(ulpi->dev.of_node); device_unregister(&ulpi->dev); } EXPORT_SYMBOL_GPL(ulpi_unregister_interface); From 0a907ee9d95e3ac35eb023d71f29eae0aaa52d1b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 27 Jan 2022 14:00:03 -0500 Subject: [PATCH 0849/1295] usb: ulpi: Call of_node_put correctly of_node_put should always be called on device nodes gotten from of_get_*. Additionally, it should only be called after there are no remaining users. To address the first issue, call of_node_put if later steps in ulpi_register fail. To address the latter, call put_device if device_register fails, which will call ulpi_dev_release if necessary. Fixes: ef6a7bcfb01c ("usb: ulpi: Support device discovery via DT") Cc: stable Reviewed-by: Heikki Krogerus Signed-off-by: Sean Anderson Link: https://lore.kernel.org/r/20220127190004.1446909-3-sean.anderson@seco.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/ulpi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index 09ad569a1a35..5509d3847af4 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -248,12 +248,16 @@ static int ulpi_register(struct device *dev, struct ulpi *ulpi) return ret; ret = ulpi_read_id(ulpi); - if (ret) + if (ret) { + of_node_put(ulpi->dev.of_node); return ret; + } ret = device_register(&ulpi->dev); - if (ret) + if (ret) { + put_device(&ulpi->dev); return ret; + } dev_dbg(&ulpi->dev, "registered ULPI PHY: vendor %04x, product %04x\n", ulpi->id.vendor, ulpi->id.product); From a4f399a1416f645ac701064a55b0cb5203707ac9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 30 Jan 2022 09:06:36 +0100 Subject: [PATCH 0850/1295] Input: wm97xx: Simplify resource management Since the commit in the Fixes tag below, 'wm->input_dev' is a managed resource that doesn't need to be explicitly unregistered or freed (see devm_input_allocate_device() documentation) So, remove some unless line of code to slightly simplify it. Fixes: c72f61e74073 ("Input: wm97xx: split out touchscreen registering") Signed-off-by: Christophe JAILLET Acked-by: Charles Keepax Link: https://lore.kernel.org/r/87dce7e80ea9b191843fa22415ca3aef5f3cc2e6.1643529968.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- drivers/input/touchscreen/wm97xx-core.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/input/touchscreen/wm97xx-core.c b/drivers/input/touchscreen/wm97xx-core.c index 78d2ee99f37a..1b58611c8084 100644 --- a/drivers/input/touchscreen/wm97xx-core.c +++ b/drivers/input/touchscreen/wm97xx-core.c @@ -615,10 +615,9 @@ static int wm97xx_register_touch(struct wm97xx *wm) * extensions) */ wm->touch_dev = platform_device_alloc("wm97xx-touch", -1); - if (!wm->touch_dev) { - ret = -ENOMEM; - goto touch_err; - } + if (!wm->touch_dev) + return -ENOMEM; + platform_set_drvdata(wm->touch_dev, wm); wm->touch_dev->dev.parent = wm->dev; wm->touch_dev->dev.platform_data = pdata; @@ -629,9 +628,6 @@ static int wm97xx_register_touch(struct wm97xx *wm) return 0; touch_reg_err: platform_device_put(wm->touch_dev); -touch_err: - input_unregister_device(wm->input_dev); - wm->input_dev = NULL; return ret; } @@ -639,8 +635,6 @@ touch_err: static void wm97xx_unregister_touch(struct wm97xx *wm) { platform_device_unregister(wm->touch_dev); - input_unregister_device(wm->input_dev); - wm->input_dev = NULL; } static int _wm97xx_probe(struct wm97xx *wm) From 5c2b9c61ae5d8ad0a196d33b66ce44543be22281 Mon Sep 17 00:00:00 2001 From: Tommaso Merciai Date: Fri, 28 Jan 2022 19:17:13 +0100 Subject: [PATCH 0851/1295] usb: usb251xb: add boost-up property support Add support for boost-up register of usb251xb hub. boost-up property control USB electrical drive strength This register can be set: - Normal mode -> 0x00 - Low -> 0x01 - Medium -> 0x10 - High -> 0x11 (Normal Default) References: - http://www.mouser.com/catalog/specsheets/2514.pdf p29 Reviewed-by: Richard Leitner Signed-off-by: Tommaso Merciai Link: https://lore.kernel.org/r/20220128181713.96856-1-tomm.merciai@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb251xb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/usb251xb.c b/drivers/usb/misc/usb251xb.c index 507deef1f709..04c4e3fed094 100644 --- a/drivers/usb/misc/usb251xb.c +++ b/drivers/usb/misc/usb251xb.c @@ -543,6 +543,9 @@ static int usb251xb_get_ofdata(struct usb251xb *hub, if (of_property_read_u16_array(np, "language-id", &hub->lang_id, 1)) hub->lang_id = USB251XB_DEF_LANGUAGE_ID; + if (of_property_read_u8(np, "boost-up", &hub->boost_up)) + hub->boost_up = USB251XB_DEF_BOOST_UP; + cproperty_char = of_get_property(np, "manufacturer", NULL); strlcpy(str, cproperty_char ? : USB251XB_DEF_MANUFACTURER_STRING, sizeof(str)); @@ -584,7 +587,6 @@ static int usb251xb_get_ofdata(struct usb251xb *hub, * may be as soon as needed. */ hub->bat_charge_en = USB251XB_DEF_BATTERY_CHARGING_ENABLE; - hub->boost_up = USB251XB_DEF_BOOST_UP; hub->boost_57 = USB251XB_DEF_BOOST_57; hub->boost_14 = USB251XB_DEF_BOOST_14; hub->port_map12 = USB251XB_DEF_PORT_MAP_12; From 292d2c82b105d92082c2120a44a58de9767e44f1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 26 Jan 2022 21:52:14 +0100 Subject: [PATCH 0852/1295] usb: raw-gadget: fix handling of dual-direction-capable endpoints Under dummy_hcd, every available endpoint is *either* IN or OUT capable. But with some real hardware, there are endpoints that support both IN and OUT. In particular, the PLX 2380 has four available endpoints that each support both IN and OUT. raw-gadget currently gets confused and thinks that any endpoint that is usable as an IN endpoint can never be used as an OUT endpoint. Fix it by looking at the direction in the configured endpoint descriptor instead of looking at the hardware capabilities. With this change, I can use the PLX 2380 with raw-gadget. Fixes: f2c2e717642c ("usb: gadget: add raw-gadget interface") Cc: stable Tested-by: Andrey Konovalov Reviewed-by: Andrey Konovalov Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20220126205214.2149936-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/raw_gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/raw_gadget.c b/drivers/usb/gadget/legacy/raw_gadget.c index c5a2c734234a..d86c3a36441e 100644 --- a/drivers/usb/gadget/legacy/raw_gadget.c +++ b/drivers/usb/gadget/legacy/raw_gadget.c @@ -1004,7 +1004,7 @@ static int raw_process_ep_io(struct raw_dev *dev, struct usb_raw_ep_io *io, ret = -EBUSY; goto out_unlock; } - if ((in && !ep->ep->caps.dir_in) || (!in && ep->ep->caps.dir_in)) { + if (in != usb_endpoint_dir_in(ep->ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EINVAL; goto out_unlock; From 459702eea6132888b5c5b64c0e9c626da4ec2493 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Fri, 28 Jan 2022 16:36:03 -0600 Subject: [PATCH 0853/1295] usb: gadget: udc: renesas_usb3: Fix host to USB_ROLE_NONE transition The support the external role switch a variety of situations were addressed, but the transition from USB_ROLE_HOST to USB_ROLE_NONE leaves the host up which can cause some error messages when switching from host to none, to gadget, to none, and then back to host again. xhci-hcd ee000000.usb: Abort failed to stop command ring: -110 xhci-hcd ee000000.usb: xHCI host controller not responding, assume dead xhci-hcd ee000000.usb: HC died; cleaning up usb 4-1: device not accepting address 6, error -108 usb usb4-port1: couldn't allocate usb_device After this happens it will not act as a host again. Fix this by releasing the host mode when transitioning to USB_ROLE_NONE. Fixes: 0604160d8c0b ("usb: gadget: udc: renesas_usb3: Enhance role switch support") Cc: stable Reviewed-by: Yoshihiro Shimoda Signed-off-by: Adam Ford Link: https://lore.kernel.org/r/20220128223603.2362621-1-aford173@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 57d417a7c3e0..601829a6b4ba 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -2378,6 +2378,8 @@ static void handle_ext_role_switch_states(struct device *dev, switch (role) { case USB_ROLE_NONE: usb3->connection_state = USB_ROLE_NONE; + if (cur_role == USB_ROLE_HOST) + device_release_driver(host); if (usb3->driver) usb3_disconnect(usb3); usb3_vbus_out(usb3, false); From 5432184107cd0013761bdfa6cb6079527ef87b95 Mon Sep 17 00:00:00 2001 From: Pavel Hofman Date: Mon, 31 Jan 2022 08:18:13 +0100 Subject: [PATCH 0854/1295] usb: gadget: f_uac2: Define specific wTerminalType Several users have reported that their Win10 does not enumerate UAC2 gadget with the existing wTerminalType set to UAC_INPUT_TERMINAL_UNDEFINED/UAC_INPUT_TERMINAL_UNDEFINED, e.g. https://github.com/raspberrypi/linux/issues/4587#issuecomment-926567213. While the constant is officially defined by the USB terminal types document, e.g. XMOS firmware for UAC2 (commonly used for Win10) defines no undefined output terminal type in its usbaudio20.h header. Therefore wTerminalType of EP-IN is set to UAC_INPUT_TERMINAL_MICROPHONE and wTerminalType of EP-OUT to UAC_OUTPUT_TERMINAL_SPEAKER for the UAC2 gadget. Signed-off-by: Pavel Hofman Cc: stable Link: https://lore.kernel.org/r/20220131071813.7433-1-pavel.hofman@ivitera.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 36fa6ef0581b..097a709549d6 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -203,7 +203,7 @@ static struct uac2_input_terminal_descriptor io_in_it_desc = { .bDescriptorSubtype = UAC_INPUT_TERMINAL, /* .bTerminalID = DYNAMIC */ - .wTerminalType = cpu_to_le16(UAC_INPUT_TERMINAL_UNDEFINED), + .wTerminalType = cpu_to_le16(UAC_INPUT_TERMINAL_MICROPHONE), .bAssocTerminal = 0, /* .bCSourceID = DYNAMIC */ .iChannelNames = 0, @@ -231,7 +231,7 @@ static struct uac2_output_terminal_descriptor io_out_ot_desc = { .bDescriptorSubtype = UAC_OUTPUT_TERMINAL, /* .bTerminalID = DYNAMIC */ - .wTerminalType = cpu_to_le16(UAC_OUTPUT_TERMINAL_UNDEFINED), + .wTerminalType = cpu_to_le16(UAC_OUTPUT_TERMINAL_SPEAKER), .bAssocTerminal = 0, /* .bSourceID = DYNAMIC */ /* .bCSourceID = DYNAMIC */ From c816b2e65b0e86b95011418cad334f0524fc33b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?TATSUKAWA=20KOSUKE=20=28=E7=AB=8B=E5=B7=9D=20=E6=B1=9F?= =?UTF-8?q?=E4=BB=8B=29?= Date: Wed, 26 Jan 2022 23:35:02 +0000 Subject: [PATCH 0855/1295] n_tty: wake up poll(POLLRDNORM) on receiving data The poll man page says POLLRDNORM is equivalent to POLLIN when used as an event. $ man poll POLLRDNORM Equivalent to POLLIN. However, in n_tty driver, POLLRDNORM does not return until timeout even if there is terminal input, whereas POLLIN returns. The following test program works until kernel-3.17, but the test stops in poll() after commit 57087d515441 ("tty: Fix spurious poll() wakeups"). [Steps to run test program] $ cc -o test-pollrdnorm test-pollrdnorm.c $ ./test-pollrdnorm foo <-- Type in something from the terminal followed by [RET]. The string should be echoed back. ------------------------< test-pollrdnorm.c >------------------------ #include #include #include #include void main(void) { int n; unsigned char buf[8]; struct pollfd fds[1] = {{ 0, POLLRDNORM, 0 }}; n = poll(fds, 1, -1); if (n < 0) perror("poll"); n = read(0, buf, 8); if (n < 0) perror("read"); if (n > 0) write(1, buf, n); } ------------------------------------------------------------------------ The attached patch fixes this problem. Many calls to wake_up_interruptible_poll() in the kernel source code already specify "POLLIN | POLLRDNORM". Fixes: 57087d515441 ("tty: Fix spurious poll() wakeups") Cc: stable@vger.kernel.org Signed-off-by: Kosuke Tatsukawa Link: https://lore.kernel.org/r/TYCPR01MB81901C0F932203D30E452B3EA5209@TYCPR01MB8190.jpnprd01.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 8933ef1f83c0..5e988e514653 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1329,7 +1329,7 @@ handle_newline: put_tty_queue(c, ldata); smp_store_release(&ldata->canon_head, ldata->read_head); kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); return; } } @@ -1561,7 +1561,7 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp, if (read_cnt(ldata)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); } } From f6c6804c43fa18d3cee64b55490dfbd3bef1363a Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 28 Jan 2022 15:40:25 +0000 Subject: [PATCH 0856/1295] kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h This way we can more easily find the next free IOCTL number when adding new IOCTLs. Fixes: be50b2065dfa ("kvm: x86: Add support for getting/setting expanded xstate buffer") Signed-off-by: Janosch Frank Message-Id: <20220128154025.102666-1-frankja@linux.ibm.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b46bcdb0cab1..5191b57e1562 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ From bd2db32e7c3e35bd4d9b8bbff689434a50893546 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 27 Jan 2022 08:16:38 +0100 Subject: [PATCH 0857/1295] moxart: fix potential use-after-free on remove path It was reported that the mmc host structure could be accessed after it was freed in moxart_remove(), so fix this by saving the base register of the device and using it instead of the pointer dereference. Cc: Ulf Hansson Cc: Xiyu Yang Cc: Xin Xiong Cc: Xin Tan Cc: Tony Lindgren Cc: Yang Li Cc: linux-mmc@vger.kernel.org Cc: stable Reported-by: whitehat002 Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220127071638.4057899-1-gregkh@linuxfoundation.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/moxart-mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 16d1c7a43d33..b6eb75f4bbfc 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -705,12 +705,12 @@ static int moxart_remove(struct platform_device *pdev) if (!IS_ERR_OR_NULL(host->dma_chan_rx)) dma_release_channel(host->dma_chan_rx); mmc_remove_host(mmc); - mmc_free_host(mmc); writel(0, host->base + REG_INTERRUPT_MASK); writel(0, host->base + REG_POWER_CONTROL); writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, host->base + REG_CLOCK_CONTROL); + mmc_free_host(mmc); return 0; } From 2d192fc4c1abeb0d04d1c8cd54405ff4a0b0255b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 16 Dec 2021 19:47:35 +0800 Subject: [PATCH 0858/1295] btrfs: don't start transaction for scrub if the fs is mounted read-only [BUG] The following super simple script would crash btrfs at unmount time, if CONFIG_BTRFS_ASSERT() is set. mkfs.btrfs -f $dev mount $dev $mnt xfs_io -f -c "pwrite 0 4k" $mnt/file umount $mnt mount -r ro $dev $mnt btrfs scrub start -Br $mnt umount $mnt This will trigger the following ASSERT() introduced by commit 0a31daa4b602 ("btrfs: add assertion for empty list of transactions at late stage of umount"). That patch is definitely not the cause, it just makes enough noise for developers. [CAUSE] We will start transaction for the following call chain during scrub: scrub_enumerate_chunks() |- btrfs_inc_block_group_ro() |- btrfs_join_transaction() However for RO mount, there is no running transaction at all, thus btrfs_join_transaction() will start a new transaction. Furthermore, since it's read-only mount, btrfs_sync_fs() will not call btrfs_commit_super() to commit the new but empty transaction. And leads to the ASSERT(). The bug has been there for a long time. Only the new ASSERT() makes it noisy enough to be noticed. [FIX] For read-only scrub on read-only mount, there is no need to start a transaction nor to allocate new chunks in btrfs_inc_block_group_ro(). Just do extra read-only mount check in btrfs_inc_block_group_ro(), and if it's read-only, skip all chunk allocation and go inc_block_group_ro() directly. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1db24e6d6d90..68feabc83a27 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2544,6 +2544,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, int ret; bool dirty_bg_running; + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) From e804861bd4e69cc5fe1053eedcb024982dde8e48 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 20 Jan 2022 20:09:16 +0900 Subject: [PATCH 0859/1295] btrfs: fix deadlock between quota disable and qgroup rescan worker Quota disable ioctl starts a transaction before waiting for the qgroup rescan worker completes. However, this wait can be infinite and results in deadlock because of circular dependency among the quota disable ioctl, the qgroup rescan worker and the other task with transaction such as block group relocation task. The deadlock happens with the steps following: 1) Task A calls ioctl to disable quota. It starts a transaction and waits for qgroup rescan worker completes. 2) Task B such as block group relocation task starts a transaction and joins to the transaction that task A started. Then task B commits to the transaction. In this commit, task B waits for a commit by task A. 3) Task C as the qgroup rescan worker starts its job and starts a transaction. In this transaction start, task C waits for completion of the transaction that task A started and task B committed. This deadlock was found with fstests test case btrfs/115 and a zoned null_blk device. The test case enables and disables quota, and the block group reclaim was triggered during the quota disable by chance. The deadlock was also observed by running quota enable and disable in parallel with 'btrfs balance' command on regular null_blk devices. An example report of the deadlock: [372.469894] INFO: task kworker/u16:6:103 blocked for more than 122 seconds. [372.479944] Not tainted 5.16.0-rc8 #7 [372.485067] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [372.493898] task:kworker/u16:6 state:D stack: 0 pid: 103 ppid: 2 flags:0x00004000 [372.503285] Workqueue: btrfs-qgroup-rescan btrfs_work_helper [btrfs] [372.510782] Call Trace: [372.514092] [372.521684] __schedule+0xb56/0x4850 [372.530104] ? io_schedule_timeout+0x190/0x190 [372.538842] ? lockdep_hardirqs_on+0x7e/0x100 [372.547092] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [372.555591] schedule+0xe0/0x270 [372.561894] btrfs_commit_transaction+0x18bb/0x2610 [btrfs] [372.570506] ? btrfs_apply_pending_changes+0x50/0x50 [btrfs] [372.578875] ? free_unref_page+0x3f2/0x650 [372.585484] ? finish_wait+0x270/0x270 [372.591594] ? release_extent_buffer+0x224/0x420 [btrfs] [372.599264] btrfs_qgroup_rescan_worker+0xc13/0x10c0 [btrfs] [372.607157] ? lock_release+0x3a9/0x6d0 [372.613054] ? btrfs_qgroup_account_extent+0xda0/0xda0 [btrfs] [372.620960] ? do_raw_spin_lock+0x11e/0x250 [372.627137] ? rwlock_bug.part.0+0x90/0x90 [372.633215] ? lock_is_held_type+0xe4/0x140 [372.639404] btrfs_work_helper+0x1ae/0xa90 [btrfs] [372.646268] process_one_work+0x7e9/0x1320 [372.652321] ? lock_release+0x6d0/0x6d0 [372.658081] ? pwq_dec_nr_in_flight+0x230/0x230 [372.664513] ? rwlock_bug.part.0+0x90/0x90 [372.670529] worker_thread+0x59e/0xf90 [372.676172] ? process_one_work+0x1320/0x1320 [372.682440] kthread+0x3b9/0x490 [372.687550] ? _raw_spin_unlock_irq+0x24/0x50 [372.693811] ? set_kthread_struct+0x100/0x100 [372.700052] ret_from_fork+0x22/0x30 [372.705517] [372.709747] INFO: task btrfs-transacti:2347 blocked for more than 123 seconds. [372.729827] Not tainted 5.16.0-rc8 #7 [372.745907] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [372.767106] task:btrfs-transacti state:D stack: 0 pid: 2347 ppid: 2 flags:0x00004000 [372.787776] Call Trace: [372.801652] [372.812961] __schedule+0xb56/0x4850 [372.830011] ? io_schedule_timeout+0x190/0x190 [372.852547] ? lockdep_hardirqs_on+0x7e/0x100 [372.871761] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [372.886792] schedule+0xe0/0x270 [372.901685] wait_current_trans+0x22c/0x310 [btrfs] [372.919743] ? btrfs_put_transaction+0x3d0/0x3d0 [btrfs] [372.938923] ? finish_wait+0x270/0x270 [372.959085] ? join_transaction+0xc75/0xe30 [btrfs] [372.977706] start_transaction+0x938/0x10a0 [btrfs] [372.997168] transaction_kthread+0x19d/0x3c0 [btrfs] [373.013021] ? btrfs_cleanup_transaction.isra.0+0xfc0/0xfc0 [btrfs] [373.031678] kthread+0x3b9/0x490 [373.047420] ? _raw_spin_unlock_irq+0x24/0x50 [373.064645] ? set_kthread_struct+0x100/0x100 [373.078571] ret_from_fork+0x22/0x30 [373.091197] [373.105611] INFO: task btrfs:3145 blocked for more than 123 seconds. [373.114147] Not tainted 5.16.0-rc8 #7 [373.120401] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [373.130393] task:btrfs state:D stack: 0 pid: 3145 ppid: 3141 flags:0x00004000 [373.140998] Call Trace: [373.145501] [373.149654] __schedule+0xb56/0x4850 [373.155306] ? io_schedule_timeout+0x190/0x190 [373.161965] ? lockdep_hardirqs_on+0x7e/0x100 [373.168469] ? _raw_spin_unlock_irqrestore+0x3e/0x60 [373.175468] schedule+0xe0/0x270 [373.180814] wait_for_commit+0x104/0x150 [btrfs] [373.187643] ? test_and_set_bit+0x20/0x20 [btrfs] [373.194772] ? kmem_cache_free+0x124/0x550 [373.201191] ? btrfs_put_transaction+0x69/0x3d0 [btrfs] [373.208738] ? finish_wait+0x270/0x270 [373.214704] ? __btrfs_end_transaction+0x347/0x7b0 [btrfs] [373.222342] btrfs_commit_transaction+0x44d/0x2610 [btrfs] [373.230233] ? join_transaction+0x255/0xe30 [btrfs] [373.237334] ? btrfs_record_root_in_trans+0x4d/0x170 [btrfs] [373.245251] ? btrfs_apply_pending_changes+0x50/0x50 [btrfs] [373.253296] relocate_block_group+0x105/0xc20 [btrfs] [373.260533] ? mutex_lock_io_nested+0x1270/0x1270 [373.267516] ? btrfs_wait_nocow_writers+0x85/0x180 [btrfs] [373.275155] ? merge_reloc_roots+0x710/0x710 [btrfs] [373.283602] ? btrfs_wait_ordered_extents+0xd30/0xd30 [btrfs] [373.291934] ? kmem_cache_free+0x124/0x550 [373.298180] btrfs_relocate_block_group+0x35c/0x930 [btrfs] [373.306047] btrfs_relocate_chunk+0x85/0x210 [btrfs] [373.313229] btrfs_balance+0x12f4/0x2d20 [btrfs] [373.320227] ? lock_release+0x3a9/0x6d0 [373.326206] ? btrfs_relocate_chunk+0x210/0x210 [btrfs] [373.333591] ? lock_is_held_type+0xe4/0x140 [373.340031] ? rcu_read_lock_sched_held+0x3f/0x70 [373.346910] btrfs_ioctl_balance+0x548/0x700 [btrfs] [373.354207] btrfs_ioctl+0x7f2/0x71b0 [btrfs] [373.360774] ? lockdep_hardirqs_on_prepare+0x410/0x410 [373.367957] ? lockdep_hardirqs_on_prepare+0x410/0x410 [373.375327] ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs] [373.383841] ? find_held_lock+0x2c/0x110 [373.389993] ? lock_release+0x3a9/0x6d0 [373.395828] ? mntput_no_expire+0xf7/0xad0 [373.402083] ? lock_is_held_type+0xe4/0x140 [373.408249] ? vfs_fileattr_set+0x9f0/0x9f0 [373.414486] ? selinux_file_ioctl+0x349/0x4e0 [373.420938] ? trace_raw_output_lock+0xb4/0xe0 [373.427442] ? selinux_inode_getsecctx+0x80/0x80 [373.434224] ? lockdep_hardirqs_on+0x7e/0x100 [373.440660] ? force_qs_rnp+0x2a0/0x6b0 [373.446534] ? lock_is_held_type+0x9b/0x140 [373.452763] ? __blkcg_punt_bio_submit+0x1b0/0x1b0 [373.459732] ? security_file_ioctl+0x50/0x90 [373.466089] __x64_sys_ioctl+0x127/0x190 [373.472022] do_syscall_64+0x3b/0x90 [373.477513] entry_SYSCALL_64_after_hwframe+0x44/0xae [373.484823] RIP: 0033:0x7f8f4af7e2bb [373.490493] RSP: 002b:00007ffcbf936178 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [373.500197] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f8f4af7e2bb [373.509451] RDX: 00007ffcbf936220 RSI: 00000000c4009420 RDI: 0000000000000003 [373.518659] RBP: 00007ffcbf93774a R08: 0000000000000013 R09: 00007f8f4b02d4e0 [373.527872] R10: 00007f8f4ae87740 R11: 0000000000000246 R12: 0000000000000001 [373.537222] R13: 00007ffcbf936220 R14: 0000000000000000 R15: 0000000000000002 [373.546506] [373.550878] INFO: task btrfs:3146 blocked for more than 123 seconds. [373.559383] Not tainted 5.16.0-rc8 #7 [373.565748] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [373.575748] task:btrfs state:D stack: 0 pid: 3146 ppid: 2168 flags:0x00000000 [373.586314] Call Trace: [373.590846] [373.595121] __schedule+0xb56/0x4850 [373.600901] ? __lock_acquire+0x23db/0x5030 [373.607176] ? io_schedule_timeout+0x190/0x190 [373.613954] schedule+0xe0/0x270 [373.619157] schedule_timeout+0x168/0x220 [373.625170] ? usleep_range_state+0x150/0x150 [373.631653] ? mark_held_locks+0x9e/0xe0 [373.637767] ? do_raw_spin_lock+0x11e/0x250 [373.643993] ? lockdep_hardirqs_on_prepare+0x17b/0x410 [373.651267] ? _raw_spin_unlock_irq+0x24/0x50 [373.657677] ? lockdep_hardirqs_on+0x7e/0x100 [373.664103] wait_for_completion+0x163/0x250 [373.670437] ? bit_wait_timeout+0x160/0x160 [373.676585] btrfs_quota_disable+0x176/0x9a0 [btrfs] [373.683979] ? btrfs_quota_enable+0x12f0/0x12f0 [btrfs] [373.691340] ? down_write+0xd0/0x130 [373.696880] ? down_write_killable+0x150/0x150 [373.703352] btrfs_ioctl+0x3945/0x71b0 [btrfs] [373.710061] ? find_held_lock+0x2c/0x110 [373.716192] ? lock_release+0x3a9/0x6d0 [373.722047] ? __handle_mm_fault+0x23cd/0x3050 [373.728486] ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs] [373.737032] ? set_pte+0x6a/0x90 [373.742271] ? do_raw_spin_unlock+0x55/0x1f0 [373.748506] ? lock_is_held_type+0xe4/0x140 [373.754792] ? vfs_fileattr_set+0x9f0/0x9f0 [373.761083] ? selinux_file_ioctl+0x349/0x4e0 [373.767521] ? selinux_inode_getsecctx+0x80/0x80 [373.774247] ? __up_read+0x182/0x6e0 [373.780026] ? count_memcg_events.constprop.0+0x46/0x60 [373.787281] ? up_write+0x460/0x460 [373.792932] ? security_file_ioctl+0x50/0x90 [373.799232] __x64_sys_ioctl+0x127/0x190 [373.805237] do_syscall_64+0x3b/0x90 [373.810947] entry_SYSCALL_64_after_hwframe+0x44/0xae [373.818102] RIP: 0033:0x7f1383ea02bb [373.823847] RSP: 002b:00007fffeb4d71f8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [373.833641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1383ea02bb [373.842961] RDX: 00007fffeb4d7210 RSI: 00000000c0109428 RDI: 0000000000000003 [373.852179] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 [373.861408] R10: 00007f1383daec78 R11: 0000000000000202 R12: 00007fffeb4d874a [373.870647] R13: 0000000000493099 R14: 0000000000000001 R15: 0000000000000000 [373.879838] [373.884018] Showing all locks held in the system: [373.894250] 3 locks held by kworker/4:1/58: [373.900356] 1 lock held by khungtaskd/63: [373.906333] #0: ffffffff8945ff60 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 [373.917307] 3 locks held by kworker/u16:6/103: [373.923938] #0: ffff888127b4f138 ((wq_completion)btrfs-qgroup-rescan){+.+.}-{0:0}, at: process_one_work+0x712/0x1320 [373.936555] #1: ffff88810b817dd8 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x73f/0x1320 [373.951109] #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_qgroup_rescan_worker+0x1f6/0x10c0 [btrfs] [373.964027] 2 locks held by less/1803: [373.969982] #0: ffff88813ed56098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x24/0x80 [373.981295] #1: ffffc90000b3b2e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x9e2/0x1060 [373.992969] 1 lock held by btrfs-transacti/2347: [373.999893] #0: ffff88813d4887a8 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0xe3/0x3c0 [btrfs] [374.015872] 3 locks held by btrfs/3145: [374.022298] #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl_balance+0xc3/0x700 [btrfs] [374.034456] #1: ffff88813d48a0a0 (&fs_info->reclaim_bgs_lock){+.+.}-{3:3}, at: btrfs_balance+0xfe5/0x2d20 [btrfs] [374.047646] #2: ffff88813d488838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x354/0x930 [btrfs] [374.063295] 4 locks held by btrfs/3146: [374.069647] #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl+0x38b1/0x71b0 [btrfs] [374.081601] #1: ffff88813d488bb8 (&fs_info->subvol_sem){+.+.}-{3:3}, at: btrfs_ioctl+0x38fd/0x71b0 [btrfs] [374.094283] #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_disable+0xc8/0x9a0 [btrfs] [374.106885] #3: ffff88813d489800 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_disable+0xd5/0x9a0 [btrfs] [374.126780] ============================================= To avoid the deadlock, wait for the qgroup rescan worker to complete before starting the transaction for the quota disable ioctl. Clear BTRFS_FS_QUOTA_ENABLE flag before the wait and the transaction to request the worker to complete. On transaction start failure, set the BTRFS_FS_QUOTA_ENABLE flag again. These BTRFS_FS_QUOTA_ENABLE flag changes can be done safely since the function btrfs_quota_disable is not called concurrently because of fs_info->subvol_sem. Also check the BTRFS_FS_QUOTA_ENABLE flag in qgroup_rescan_init to avoid another qgroup rescan worker to start after the previous qgroup worker completed. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8928275823a1..f12dc687350c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1185,9 +1185,24 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans = NULL; int ret = 0; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to disable quotas, because we will unlock + * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. + */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); mutex_unlock(&fs_info->qgroup_ioctl_lock); /* @@ -1205,14 +1220,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -3383,6 +3397,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; } if (ret) { From 0c982944af27d131d3b74242f3528169f66950ad Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:34 +0800 Subject: [PATCH 0860/1295] btrfs: tree-checker: check item_size for inode_item while mounting the crafted image, out-of-bounds access happens: [350.429619] UBSAN: array-index-out-of-bounds in fs/btrfs/struct-funcs.c:161:1 [350.429636] index 1048096 is out of range for type 'page *[16]' [350.429650] CPU: 0 PID: 9 Comm: kworker/u8:1 Not tainted 5.16.0-rc4 #1 [350.429652] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [350.429653] Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] [350.429772] Call Trace: [350.429774] [350.429776] dump_stack_lvl+0x47/0x5c [350.429780] ubsan_epilogue+0x5/0x50 [350.429786] __ubsan_handle_out_of_bounds+0x66/0x70 [350.429791] btrfs_get_16+0xfd/0x120 [btrfs] [350.429832] check_leaf+0x754/0x1a40 [btrfs] [350.429874] ? filemap_read+0x34a/0x390 [350.429878] ? load_balance+0x175/0xfc0 [350.429881] validate_extent_buffer+0x244/0x310 [btrfs] [350.429911] btrfs_validate_metadata_buffer+0xf8/0x100 [btrfs] [350.429935] end_bio_extent_readpage+0x3af/0x850 [btrfs] [350.429969] ? newidle_balance+0x259/0x480 [350.429972] end_workqueue_fn+0x29/0x40 [btrfs] [350.429995] btrfs_work_helper+0x71/0x330 [btrfs] [350.430030] ? __schedule+0x2fb/0xa40 [350.430033] process_one_work+0x1f6/0x400 [350.430035] ? process_one_work+0x400/0x400 [350.430036] worker_thread+0x2d/0x3d0 [350.430037] ? process_one_work+0x400/0x400 [350.430038] kthread+0x165/0x190 [350.430041] ? set_kthread_struct+0x40/0x40 [350.430043] ret_from_fork+0x1f/0x30 [350.430047] [350.430077] BTRFS warning (device loop0): bad eb member start: ptr 0xffe20f4e start 20975616 member offset 4293005178 size 2 check_leaf() is checking the leaf: corrupt leaf: root=4 block=29396992 slot=1, bad key order, prev (16140901064495857664 1 0) current (1 204 12582912) leaf 29396992 items 6 free space 3565 generation 6 owner DEV_TREE leaf 29396992 flags 0x1(WRITTEN) backref revision 1 fs uuid a62e00e8-e94e-4200-8217-12444de93c2e chunk uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 item 0 key (16140901064495857664 INODE_ITEM 0) itemoff 3955 itemsize 40 generation 0 transid 0 size 0 nbytes 17592186044416 block group 0 mode 52667 links 33 uid 0 gid 2104132511 rdev 94223634821136 sequence 100305 flags 0x2409000(none) atime 0.0 (1970-01-01 08:00:00) ctime 2973280098083405823.4294967295 (-269783007-01-01 21:37:03) mtime 18446744071572723616.4026825121 (1902-04-16 12:40:00) otime 9249929404488876031.4294967295 (622322949-04-16 04:25:58) item 1 key (1 DEV_EXTENT 12582912) itemoff 3907 itemsize 48 dev extent chunk_tree 3 chunk_objectid 256 chunk_offset 12582912 length 8388608 chunk_tree_uuid cecbd0f7-9ca0-441e-ae9f-f782f9732bd8 The corrupted leaf of device tree has an inode item. The leaf passed checksum and others checks in validate_extent_buffer until check_leaf_item(). Because of the key type BTRFS_INODE_ITEM, check_inode_item() is called even we are in the device tree. Since the item offset + sizeof(struct btrfs_inode_item) > eb->len, out-of-bounds access is triggered. The item end vs leaf boundary check has been done before check_leaf_item(), so fix it by checking item size in check_inode_item() before access of the inode item in extent buffer. Other check functions except check_dev_item() in check_leaf_item() have their item size checks. The commit for check_dev_item() is followed. No regression observed during running fstests. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215299 CC: stable@vger.kernel.org # 5.10+ CC: Wenqing Liu Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 72e1c942197d..a819eb5e264a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1007,6 +1007,7 @@ static int check_inode_item(struct extent_buffer *leaf, struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size(leaf, slot); u32 mode; int ret; u32 flags; @@ -1016,6 +1017,12 @@ static int check_inode_item(struct extent_buffer *leaf, if (unlikely(ret < 0)) return ret; + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ From ea1d1ca4025ac6c075709f549f9aa036b5b6597d Mon Sep 17 00:00:00 2001 From: Su Yue Date: Fri, 21 Jan 2022 17:33:35 +0800 Subject: [PATCH 0861/1295] btrfs: tree-checker: check item_size for dev_item Check item size before accessing the device item to avoid out of bound access, similar to inode_item check. Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a819eb5e264a..9fd145f1c4bc 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -965,6 +965,7 @@ static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, @@ -972,6 +973,13 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, From 28b21c558a3753171097193b6f6602a94169093a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 21 Jan 2022 15:44:39 +0000 Subject: [PATCH 0862/1295] btrfs: fix use-after-free after failure to create a snapshot At ioctl.c:create_snapshot(), we allocate a pending snapshot structure and then attach it to the transaction's list of pending snapshots. After that we call btrfs_commit_transaction(), and if that returns an error we jump to 'fail' label, where we kfree() the pending snapshot structure. This can result in a later use-after-free of the pending snapshot: 1) We allocated the pending snapshot and added it to the transaction's list of pending snapshots; 2) We call btrfs_commit_transaction(), and it fails either at the first call to btrfs_run_delayed_refs() or btrfs_start_dirty_block_groups(). In both cases, we don't abort the transaction and we release our transaction handle. We jump to the 'fail' label and free the pending snapshot structure. We return with the pending snapshot still in the transaction's list; 3) Another task commits the transaction. This time there's no error at all, and then during the transaction commit it accesses a pointer to the pending snapshot structure that the snapshot creation task has already freed, resulting in a user-after-free. This issue could actually be detected by smatch, which produced the following warning: fs/btrfs/ioctl.c:843 create_snapshot() warn: '&pending_snapshot->list' not removed from list So fix this by not having the snapshot creation ioctl directly add the pending snapshot to the transaction's list. Instead add the pending snapshot to the transaction handle, and then at btrfs_commit_transaction() we add the snapshot to the list only when we can guarantee that any error returned after that point will result in a transaction abort, in which case the ioctl code can safely free the pending snapshot and no one can access it anymore. CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +---- fs/btrfs/transaction.c | 24 ++++++++++++++++++++++++ fs/btrfs/transaction.h | 2 ++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index eef5b300b9a9..90c11ddff6e5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -805,10 +805,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&fs_info->trans_lock); + trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03de89b45f27..c43bbc7f623e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2000,6 +2000,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } +/* + * Add a pending snapshot associated with the given transaction handle to the + * respective handle. This must be called after the transaction commit started + * and while holding fs_info->trans_lock. + * This serves to guarantee a caller of btrfs_commit_transaction() that it can + * safely free the pending snapshot pointer in case btrfs_commit_transaction() + * returns an error. + */ +static void add_pending_snapshot(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->pending_snapshot) + return; + + lockdep_assert_held(&trans->fs_info->trans_lock); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + + list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2073,6 +2094,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (cur_trans->state >= TRANS_STATE_COMMIT_START) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + add_pending_snapshot(trans); + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); @@ -2163,6 +2186,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); + add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); wait_event(cur_trans->writer_wait, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1852ed9de7fd..9402d8d94484 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,6 +123,8 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + /* Set by a task that wants to create a snapshot. */ + struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* From 37b4599547e324589e011c20f74b021d6d25cb7f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 21 Jan 2022 05:45:22 -0800 Subject: [PATCH 0863/1295] btrfs: fix use of uninitialized variable at rm device ioctl Clang static analysis reports this problem ioctl.c:3333:8: warning: 3rd function call argument is an uninitialized value ret = exclop_start_or_cancel_reloc(fs_info, cancel is only set in one branch of an if-check and is always used. So initialize to false. Fixes: 1a15eb724aae ("btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls") Reviewed-by: Filipe Manana Reviewed-by: Anand Jain Signed-off-by: Tom Rix Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 90c11ddff6e5..925522756e28 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3353,7 +3353,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct block_device *bdev = NULL; fmode_t mode; int ret; - bool cancel; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; From 40cdc509877bacb438213b83c7541c5e24a1d9ec Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jan 2022 13:39:34 +0000 Subject: [PATCH 0864/1295] btrfs: skip reserved bytes warning on unmount after log cleanup failure After the recent changes made by commit c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it") and its followup fix, commit 651740a5024117 ("btrfs: check WRITE_ERR when trying to read an extent buffer"), we can now end up not cleaning up space reservations of log tree extent buffers after a transaction abort happens, as well as not cleaning up still dirty extent buffers. This happens because if writeback for a log tree extent buffer failed, then we have cleared the bit EXTENT_BUFFER_UPTODATE from the extent buffer and we have also set the bit EXTENT_BUFFER_WRITE_ERR on it. Later on, when trying to free the log tree with free_log_tree(), which iterates over the tree, we can end up getting an -EIO error when trying to read a node or a leaf, since read_extent_buffer_pages() returns -EIO if an extent buffer does not have EXTENT_BUFFER_UPTODATE set and has the EXTENT_BUFFER_WRITE_ERR bit set. Getting that -EIO means that we return immediately as we can not iterate over the entire tree. In that case we never update the reserved space for an extent buffer in the respective block group and space_info object. When this happens we get the following traces when unmounting the fs: [174957.284509] BTRFS: error (device dm-0) in cleanup_transaction:1913: errno=-5 IO failure [174957.286497] BTRFS: error (device dm-0) in free_log_tree:3420: errno=-5 IO failure [174957.399379] ------------[ cut here ]------------ [174957.402497] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:127 btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.407523] Modules linked in: btrfs overlay dm_zero (...) [174957.424917] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.426689] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.428716] RIP: 0010:btrfs_put_block_group+0x77/0xb0 [btrfs] [174957.429717] Code: 21 48 8b bd (...) [174957.432867] RSP: 0018:ffffb70d41cffdd0 EFLAGS: 00010206 [174957.433632] RAX: 0000000000000001 RBX: ffff8b09c3848000 RCX: ffff8b0758edd1c8 [174957.434689] RDX: 0000000000000001 RSI: ffffffffc0b467e7 RDI: ffff8b0758edd000 [174957.436068] RBP: ffff8b0758edd000 R08: 0000000000000000 R09: 0000000000000000 [174957.437114] R10: 0000000000000246 R11: 0000000000000000 R12: ffff8b09c3848148 [174957.438140] R13: ffff8b09c3848198 R14: ffff8b0758edd188 R15: dead000000000100 [174957.439317] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.440402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.441164] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.442117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.443076] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.443948] Call Trace: [174957.444264] [174957.444538] btrfs_free_block_groups+0x255/0x3c0 [btrfs] [174957.445238] close_ctree+0x301/0x357 [btrfs] [174957.445803] ? call_rcu+0x16c/0x290 [174957.446250] generic_shutdown_super+0x74/0x120 [174957.446832] kill_anon_super+0x14/0x30 [174957.447305] btrfs_kill_super+0x12/0x20 [btrfs] [174957.447890] deactivate_locked_super+0x31/0xa0 [174957.448440] cleanup_mnt+0x147/0x1c0 [174957.448888] task_work_run+0x5c/0xa0 [174957.449336] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.449934] syscall_exit_to_user_mode+0x16/0x40 [174957.450512] do_syscall_64+0x48/0xc0 [174957.450980] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.451605] RIP: 0033:0x7f328fdc4a97 [174957.452059] Code: 03 0c 00 f7 (...) [174957.454320] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.455262] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.456131] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.457118] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.458005] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.459113] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.460193] [174957.460534] irq event stamp: 0 [174957.461003] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.461947] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.463147] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.465116] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.466323] ---[ end trace bc7ee0c490bce3af ]--- [174957.467282] ------------[ cut here ]------------ [174957.468184] WARNING: CPU: 2 PID: 3206883 at fs/btrfs/block-group.c:3976 btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.470066] Modules linked in: btrfs overlay dm_zero (...) [174957.483137] CPU: 2 PID: 3206883 Comm: umount Tainted: G W 5.16.0-rc5-btrfs-next-109 #1 [174957.484691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [174957.486853] RIP: 0010:btrfs_free_block_groups+0x330/0x3c0 [btrfs] [174957.488050] Code: 00 00 00 ad de (...) [174957.491479] RSP: 0018:ffffb70d41cffde0 EFLAGS: 00010206 [174957.492520] RAX: ffff8b08d79310b0 RBX: ffff8b09c3848000 RCX: 0000000000000000 [174957.493868] RDX: 0000000000000001 RSI: fffff443055ee600 RDI: ffffffffb1131846 [174957.495183] RBP: ffff8b08d79310b0 R08: 0000000000000000 R09: 0000000000000000 [174957.496580] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8b08d7931000 [174957.498027] R13: ffff8b09c38492b0 R14: dead000000000122 R15: dead000000000100 [174957.499438] FS: 00007f328fb82800(0000) GS:ffff8b0a2d200000(0000) knlGS:0000000000000000 [174957.500990] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [174957.502117] CR2: 00007fff13563e98 CR3: 0000000404f4e005 CR4: 0000000000370ee0 [174957.503513] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [174957.504864] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [174957.506167] Call Trace: [174957.506654] [174957.507047] close_ctree+0x301/0x357 [btrfs] [174957.507867] ? call_rcu+0x16c/0x290 [174957.508567] generic_shutdown_super+0x74/0x120 [174957.509447] kill_anon_super+0x14/0x30 [174957.510194] btrfs_kill_super+0x12/0x20 [btrfs] [174957.511123] deactivate_locked_super+0x31/0xa0 [174957.511976] cleanup_mnt+0x147/0x1c0 [174957.512610] task_work_run+0x5c/0xa0 [174957.513309] exit_to_user_mode_prepare+0x1e5/0x1f0 [174957.514231] syscall_exit_to_user_mode+0x16/0x40 [174957.515069] do_syscall_64+0x48/0xc0 [174957.515718] entry_SYSCALL_64_after_hwframe+0x44/0xae [174957.516688] RIP: 0033:0x7f328fdc4a97 [174957.517413] Code: 03 0c 00 f7 d8 (...) [174957.521052] RSP: 002b:00007fff13564ec8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [174957.522514] RAX: 0000000000000000 RBX: 00007f328feea264 RCX: 00007f328fdc4a97 [174957.523950] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000560b8ae51dd0 [174957.525375] RBP: 0000560b8ae51ba0 R08: 0000000000000000 R09: 00007fff13563c40 [174957.526763] R10: 00007f328fe49fc0 R11: 0000000000000246 R12: 0000000000000000 [174957.528058] R13: 0000560b8ae51dd0 R14: 0000560b8ae51cb0 R15: 0000000000000000 [174957.529404] [174957.529843] irq event stamp: 0 [174957.530256] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [174957.531061] hardirqs last disabled at (0): [] copy_process+0x934/0x2040 [174957.532075] softirqs last enabled at (0): [] copy_process+0x934/0x2040 [174957.533083] softirqs last disabled at (0): [<0000000000000000>] 0x0 [174957.533865] ---[ end trace bc7ee0c490bce3b0 ]--- [174957.534452] BTRFS info (device dm-0): space_info 4 has 1070841856 free, is not full [174957.535404] BTRFS info (device dm-0): space_info total=1073741824, used=2785280, pinned=0, reserved=49152, may_use=0, readonly=65536 zone_unusable=0 [174957.537029] BTRFS info (device dm-0): global_block_rsv: size 0 reserved 0 [174957.537859] BTRFS info (device dm-0): trans_block_rsv: size 0 reserved 0 [174957.538697] BTRFS info (device dm-0): chunk_block_rsv: size 0 reserved 0 [174957.539552] BTRFS info (device dm-0): delayed_block_rsv: size 0 reserved 0 [174957.540403] BTRFS info (device dm-0): delayed_refs_rsv: size 0 reserved 0 This also means that in case we have log tree extent buffers that are still dirty, we can end up not cleaning them up in case we find an extent buffer with EXTENT_BUFFER_WRITE_ERR set on it, as in that case we have no way for iterating over the rest of the tree. This issue is very often triggered with test cases generic/475 and generic/648 from fstests. The issue could almost be fixed by iterating over the io tree attached to each log root which keeps tracks of the range of allocated extent buffers, log_root->dirty_log_pages, however that does not work and has some inconveniences: 1) After we sync the log, we clear the range of the extent buffers from the io tree, so we can't find them after writeback. We could keep the ranges in the io tree, with a separate bit to signal they represent extent buffers already written, but that means we need to hold into more memory until the transaction commits. How much more memory is used depends a lot on whether we are able to allocate contiguous extent buffers on disk (and how often) for a log tree - if we are able to, then a single extent state record can represent multiple extent buffers, otherwise we need multiple extent state record structures to track each extent buffer. In fact, my earlier approach did that: https://lore.kernel.org/linux-btrfs/3aae7c6728257c7ce2279d6660ee2797e5e34bbd.1641300250.git.fdmanana@suse.com/ However that can cause a very significant negative impact on performance, not only due to the extra memory usage but also because we get a larger and deeper dirty_log_pages io tree. We got a report that, on beefy machines at least, we can get such performance drop with fsmark for example: https://lore.kernel.org/linux-btrfs/20220117082426.GE32491@xsang-OptiPlex-9020/ 2) We would be doing it only to deal with an unexpected and exceptional case, which is basically failure to read an extent buffer from disk due to IO failures. On a healthy system we don't expect transaction aborts to happen after all; 3) Instead of relying on iterating the log tree or tracking the ranges of extent buffers in the dirty_log_pages io tree, using the radix tree that tracks extent buffers (fs_info->buffer_radix) to find all log tree extent buffers is not reliable either, because after writeback of an extent buffer it can be evicted from memory by the release page callback of the btree inode (btree_releasepage()). Since there's no way to be able to properly cleanup a log tree without being able to read its extent buffers from disk and without using more memory to track the logical ranges of the allocated extent buffers do the following: 1) When we fail to cleanup a log tree, setup a flag that indicates that failure; 2) Trigger writeback of all log tree extent buffers that are still dirty, and wait for the writeback to complete. This is just to cleanup their state, page states, page leaks, etc; 3) When unmounting the fs, ignore if the number of bytes reserved in a block group and in a space_info is not 0 if, and only if, we failed to cleanup a log tree. Also ignore only for metadata block groups and the metadata space_info object. This is far from a perfect solution, but it serves to silence test failures such as those from generic/475 and generic/648. However having a non-zero value for the reserved bytes counters on unmount after a transaction abort, is not such a terrible thing and it's completely harmless, it does not affect the filesystem integrity in any way. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 26 ++++++++++++++++++++++++-- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/tree-log.c | 23 +++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 68feabc83a27..8202ad6aa131 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -3987,9 +3996,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4a9b1c58d22..8992e0096163 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -145,6 +145,9 @@ enum { BTRFS_FS_STATE_DUMMY_FS_INFO, BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -3593,6 +3596,9 @@ do { \ #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) __printf(5, 6) __cold diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1ddbe800897..3ee014c06b82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3414,6 +3414,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. + */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + if (trans) btrfs_abort_transaction(trans, ret); else From f83a96e5f033fbbd21764705cb9c04234b96218e Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Mon, 31 Jan 2022 15:17:08 +0100 Subject: [PATCH 0865/1295] spi: mediatek: Avoid NULL pointer crash in interrupt In some case, like after a transfer timeout, master->cur_msg pointer is NULL which led to a kernel crash when trying to use master->cur_msg->spi. mtk_spi_can_dma(), pointed by master->can_dma, doesn't use this parameter avoid the problem by setting NULL as second parameter. Fixes: a568231f46322 ("spi: mediatek: Add spi bus for Mediatek MT8173") Signed-off-by: Benjamin Gaignard Link: https://lore.kernel.org/r/20220131141708.888710-1-benjamin.gaignard@collabora.com Signed-off-by: Mark Brown --- drivers/spi/spi-mt65xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index a15de10ee286..753bd313e6fd 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -624,7 +624,7 @@ static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id) else mdata->state = MTK_SPI_IDLE; - if (!master->can_dma(master, master->cur_msg->spi, trans)) { + if (!master->can_dma(master, NULL, trans)) { if (trans->rx_buf) { cnt = mdata->xfer_len / 4; ioread32_rep(mdata->base + SPI_RX_DATA_REG, From b54240ad494300ff0994c4539a531727874381f4 Mon Sep 17 00:00:00 2001 From: Vijayanand Jitta Date: Mon, 31 Jan 2022 12:42:35 +0530 Subject: [PATCH 0866/1295] iommu: Fix potential use-after-free during probe Kasan has reported the following use after free on dev->iommu. when a device probe fails and it is in process of freeing dev->iommu in dev_iommu_free function, a deferred_probe_work_func runs in parallel and tries to access dev->iommu->fwspec in of_iommu_configure path thus causing use after free. BUG: KASAN: use-after-free in of_iommu_configure+0xb4/0x4a4 Read of size 8 at addr ffffff87a2f1acb8 by task kworker/u16:2/153 Workqueue: events_unbound deferred_probe_work_func Call trace: dump_backtrace+0x0/0x33c show_stack+0x18/0x24 dump_stack_lvl+0x16c/0x1e0 print_address_description+0x84/0x39c __kasan_report+0x184/0x308 kasan_report+0x50/0x78 __asan_load8+0xc0/0xc4 of_iommu_configure+0xb4/0x4a4 of_dma_configure_id+0x2fc/0x4d4 platform_dma_configure+0x40/0x5c really_probe+0x1b4/0xb74 driver_probe_device+0x11c/0x228 __device_attach_driver+0x14c/0x304 bus_for_each_drv+0x124/0x1b0 __device_attach+0x25c/0x334 device_initial_probe+0x24/0x34 bus_probe_device+0x78/0x134 deferred_probe_work_func+0x130/0x1a8 process_one_work+0x4c8/0x970 worker_thread+0x5c8/0xaec kthread+0x1f8/0x220 ret_from_fork+0x10/0x18 Allocated by task 1: ____kasan_kmalloc+0xd4/0x114 __kasan_kmalloc+0x10/0x1c kmem_cache_alloc_trace+0xe4/0x3d4 __iommu_probe_device+0x90/0x394 probe_iommu_group+0x70/0x9c bus_for_each_dev+0x11c/0x19c bus_iommu_probe+0xb8/0x7d4 bus_set_iommu+0xcc/0x13c arm_smmu_bus_init+0x44/0x130 [arm_smmu] arm_smmu_device_probe+0xb88/0xc54 [arm_smmu] platform_drv_probe+0xe4/0x13c really_probe+0x2c8/0xb74 driver_probe_device+0x11c/0x228 device_driver_attach+0xf0/0x16c __driver_attach+0x80/0x320 bus_for_each_dev+0x11c/0x19c driver_attach+0x38/0x48 bus_add_driver+0x1dc/0x3a4 driver_register+0x18c/0x244 __platform_driver_register+0x88/0x9c init_module+0x64/0xff4 [arm_smmu] do_one_initcall+0x17c/0x2f0 do_init_module+0xe8/0x378 load_module+0x3f80/0x4a40 __se_sys_finit_module+0x1a0/0x1e4 __arm64_sys_finit_module+0x44/0x58 el0_svc_common+0x100/0x264 do_el0_svc+0x38/0xa4 el0_svc+0x20/0x30 el0_sync_handler+0x68/0xac el0_sync+0x160/0x180 Freed by task 1: kasan_set_track+0x4c/0x84 kasan_set_free_info+0x28/0x4c ____kasan_slab_free+0x120/0x15c __kasan_slab_free+0x18/0x28 slab_free_freelist_hook+0x204/0x2fc kfree+0xfc/0x3a4 __iommu_probe_device+0x284/0x394 probe_iommu_group+0x70/0x9c bus_for_each_dev+0x11c/0x19c bus_iommu_probe+0xb8/0x7d4 bus_set_iommu+0xcc/0x13c arm_smmu_bus_init+0x44/0x130 [arm_smmu] arm_smmu_device_probe+0xb88/0xc54 [arm_smmu] platform_drv_probe+0xe4/0x13c really_probe+0x2c8/0xb74 driver_probe_device+0x11c/0x228 device_driver_attach+0xf0/0x16c __driver_attach+0x80/0x320 bus_for_each_dev+0x11c/0x19c driver_attach+0x38/0x48 bus_add_driver+0x1dc/0x3a4 driver_register+0x18c/0x244 __platform_driver_register+0x88/0x9c init_module+0x64/0xff4 [arm_smmu] do_one_initcall+0x17c/0x2f0 do_init_module+0xe8/0x378 load_module+0x3f80/0x4a40 __se_sys_finit_module+0x1a0/0x1e4 __arm64_sys_finit_module+0x44/0x58 el0_svc_common+0x100/0x264 do_el0_svc+0x38/0xa4 el0_svc+0x20/0x30 el0_sync_handler+0x68/0xac el0_sync+0x160/0x180 Fix this by setting dev->iommu to NULL first and then freeing dev_iommu structure in dev_iommu_free function. Suggested-by: Robin Murphy Signed-off-by: Vijayanand Jitta Link: https://lore.kernel.org/r/1643613155-20215-1-git-send-email-quic_vjitta@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8b86406b7162..3632bf8b4031 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -207,9 +207,14 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) static void dev_iommu_free(struct device *dev) { - iommu_fwspec_free(dev); - kfree(dev->iommu); + struct dev_iommu *param = dev->iommu; + dev->iommu = NULL; + if (param->fwspec) { + fwnode_handle_put(param->fwspec->iommu_fwnode); + kfree(param->fwspec); + } + kfree(param); } static int __iommu_probe_device(struct device *dev, struct list_head *group_list) From 30209b93177a75843673de92771716c941c20ef5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 28 Jan 2022 18:44:33 +0800 Subject: [PATCH 0867/1295] iommu: Fix some W=1 warnings The code is mostly free of W=1 warning, so fix the following: drivers/iommu/iommu.c:996: warning: expecting prototype for iommu_group_for_each_dev(). Prototype was for __iommu_group_for_each_dev() instead drivers/iommu/iommu.c:3048: warning: Function parameter or member 'drvdata' not described in 'iommu_sva_bind_device' drivers/iommu/ioasid.c:354: warning: Function parameter or member 'ioasid' not described in 'ioasid_get' drivers/iommu/omap-iommu.c:1098: warning: expecting prototype for omap_iommu_suspend_prepare(). Prototype was for omap_iommu_prepare() instead Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1643366673-26803-1-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/ioasid.c | 1 + drivers/iommu/iommu.c | 24 ++++++++++++------------ drivers/iommu/omap-iommu.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 50ee27bbd04e..06fee7416816 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(ioasid_alloc); /** * ioasid_get - obtain a reference to the IOASID + * @ioasid: the ID to get */ void ioasid_get(ioasid_t ioasid) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3632bf8b4031..107dcf5938d6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -985,17 +985,6 @@ static int iommu_group_device_count(struct iommu_group *group) return ret; } -/** - * iommu_group_for_each_dev - iterate over each device in the group - * @group: the group - * @data: caller opaque data to be passed to callback function - * @fn: caller supplied callback function - * - * This function is called by group users to iterate over group devices. - * Callers should hold a reference count to the group during callback. - * The group->mutex is held across callbacks, which will block calls to - * iommu_group_add/remove_device. - */ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -1010,7 +999,17 @@ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data, return ret; } - +/** + * iommu_group_for_each_dev - iterate over each device in the group + * @group: the group + * @data: caller opaque data to be passed to callback function + * @fn: caller supplied callback function + * + * This function is called by group users to iterate over group devices. + * Callers should hold a reference count to the group during callback. + * The group->mutex is held across callbacks, which will block calls to + * iommu_group_add/remove_device. + */ int iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { @@ -3037,6 +3036,7 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid); * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to it + * @drvdata: opaque data pointer to pass to bind callback * * Create a bond between device and address space, allowing the device to access * the mm using the returned PASID. If a bond already exists between @device and diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 91749654fd49..980e4af3f06b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1085,7 +1085,7 @@ static __maybe_unused int omap_iommu_runtime_resume(struct device *dev) } /** - * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation + * omap_iommu_prepare - prepare() dev_pm_ops implementation * @dev: iommu device * * This function performs the necessary checks to determine if the IOMMU From 99e675d473eb8cf2deac1376a0f840222fc1adcf Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 28 Jan 2022 11:10:02 +0800 Subject: [PATCH 0868/1295] iommu/vt-d: Fix potential memory leak in intel_setup_irq_remapping() After commit e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated"). For tear down scenario, fn is only freed after fail to allocate ir_domain, though it also should be freed in case dmar_enable_qi returns error. Besides free fn, irq_domain and ir_msi_domain need to be removed as well if intel_setup_irq_remapping fails to enable queued invalidation. Improve the rewinding path by add out_free_ir_domain and out_free_fwnode lables per Baolu's suggestion. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Suggested-by: Lu Baolu Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220119063640.16864-1-guoqing.jiang@linux.dev Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220128031002.2219155-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index f912fe45bea2..a67319597884 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -569,9 +569,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) fn, &intel_ir_domain_ops, iommu); if (!iommu->ir_domain) { - irq_domain_free_fwnode(fn); pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); - goto out_free_bitmap; + goto out_free_fwnode; } iommu->ir_msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, @@ -595,7 +594,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (dmar_enable_qi(iommu)) { pr_err("Failed to enable queued invalidation\n"); - goto out_free_bitmap; + goto out_free_ir_domain; } } @@ -619,6 +618,14 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) return 0; +out_free_ir_domain: + if (iommu->ir_msi_domain) + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; +out_free_fwnode: + irq_domain_free_fwnode(fn); out_free_bitmap: bitmap_free(bitmap); out_free_pages: From c26b85ea16365079be8d206b20556a60a0c69ad4 Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Mon, 24 Jan 2022 13:52:55 +0530 Subject: [PATCH 0869/1295] scsi: pm80xx: Fix double completion for SATA devices Current code handles completions for SATA devices in mpi_sata_completion() and mpi_sata_event(). However, at the time when any SATA event happens, for almost all the event types, the command is still in the target. It is therefore incorrect to complete the task in sata_event(). There are some events for which we get sata_completions, some need recovery procedure and others abort. All the tasks must be completed via sata_completion() path. Removed the task done related code from sata_events(). For tasks where we don't get completions, let top layer call abort() to abort the command post timeout. Link: https://lore.kernel.org/r/20220124082255.86223-1-Ajish.Koshy@microchip.com Acked-by: Jack Wang Co-developed-by: Viswas G Signed-off-by: Viswas G Signed-off-by: Ajish Koshy Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_hwi.c | 18 ------------------ drivers/scsi/pm8001/pm80xx_hwi.c | 26 -------------------------- 2 files changed, 44 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index c814e5071712..9ec310b795c3 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -2692,7 +2692,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) u32 tag = le32_to_cpu(psataPayload->tag); u32 port_id = le32_to_cpu(psataPayload->port_id); u32 dev_id = le32_to_cpu(psataPayload->device_id); - unsigned long flags; if (event) pm8001_dbg(pm8001_ha, FAIL, "SATA EVENT 0x%x\n", event); @@ -2724,8 +2723,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_DATA_OVERRUN; ts->residual = 0; - if (pm8001_dev) - atomic_dec(&pm8001_dev->running_req); break; case IO_XFER_ERROR_BREAK: pm8001_dbg(pm8001_ha, IO, "IO_XFER_ERROR_BREAK\n"); @@ -2767,7 +2764,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS); ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_QUEUE_FULL; - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); return; } break; @@ -2853,20 +2849,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, void *piomb) ts->stat = SAS_OPEN_TO; break; } - spin_lock_irqsave(&t->task_state_lock, flags); - t->task_state_flags &= ~SAS_TASK_STATE_PENDING; - t->task_state_flags &= ~SAS_TASK_AT_INITIATOR; - t->task_state_flags |= SAS_TASK_STATE_DONE; - if (unlikely((t->task_state_flags & SAS_TASK_STATE_ABORTED))) { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_dbg(pm8001_ha, FAIL, - "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", - t, event, ts->resp, ts->stat); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); - } else { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - } } /*See the comments for mpi_ssp_completion */ diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 2530d1365556..c350d2017aa4 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2821,7 +2821,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, u32 tag = le32_to_cpu(psataPayload->tag); u32 port_id = le32_to_cpu(psataPayload->port_id); u32 dev_id = le32_to_cpu(psataPayload->device_id); - unsigned long flags; if (event) pm8001_dbg(pm8001_ha, FAIL, "SATA EVENT 0x%x\n", event); @@ -2854,8 +2853,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_DATA_OVERRUN; ts->residual = 0; - if (pm8001_dev) - atomic_dec(&pm8001_dev->running_req); break; case IO_XFER_ERROR_BREAK: pm8001_dbg(pm8001_ha, IO, "IO_XFER_ERROR_BREAK\n"); @@ -2904,11 +2901,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS); ts->resp = SAS_TASK_COMPLETE; ts->stat = SAS_QUEUE_FULL; - spin_unlock_irqrestore(&circularQ->oq_lock, - circularQ->lock_flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - spin_lock_irqsave(&circularQ->oq_lock, - circularQ->lock_flags); return; } break; @@ -3008,24 +3000,6 @@ static void mpi_sata_event(struct pm8001_hba_info *pm8001_ha, ts->stat = SAS_OPEN_TO; break; } - spin_lock_irqsave(&t->task_state_lock, flags); - t->task_state_flags &= ~SAS_TASK_STATE_PENDING; - t->task_state_flags &= ~SAS_TASK_AT_INITIATOR; - t->task_state_flags |= SAS_TASK_STATE_DONE; - if (unlikely((t->task_state_flags & SAS_TASK_STATE_ABORTED))) { - spin_unlock_irqrestore(&t->task_state_lock, flags); - pm8001_dbg(pm8001_ha, FAIL, - "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", - t, event, ts->resp, ts->stat); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); - } else { - spin_unlock_irqrestore(&t->task_state_lock, flags); - spin_unlock_irqrestore(&circularQ->oq_lock, - circularQ->lock_flags); - pm8001_ccb_task_free_done(pm8001_ha, t, ccb, tag); - spin_lock_irqsave(&circularQ->oq_lock, - circularQ->lock_flags); - } } /*See the comments for mpi_ssp_completion */ From 936bd03405fc83ba039d42bc93ffd4b88418f1d3 Mon Sep 17 00:00:00 2001 From: John Meneghini Date: Mon, 24 Jan 2022 09:51:10 -0500 Subject: [PATCH 0870/1295] scsi: bnx2fc: Make bnx2fc_recv_frame() mp safe Running tests with a debug kernel shows that bnx2fc_recv_frame() is modifying the per_cpu lport stats counters in a non-mpsafe way. Just boot a debug kernel and run the bnx2fc driver with the hardware enabled. [ 1391.699147] BUG: using smp_processor_id() in preemptible [00000000] code: bnx2fc_ [ 1391.699160] caller is bnx2fc_recv_frame+0xbf9/0x1760 [bnx2fc] [ 1391.699174] CPU: 2 PID: 4355 Comm: bnx2fc_l2_threa Kdump: loaded Tainted: G B [ 1391.699180] Hardware name: HP ProLiant DL120 G7, BIOS J01 07/01/2013 [ 1391.699183] Call Trace: [ 1391.699188] dump_stack_lvl+0x57/0x7d [ 1391.699198] check_preemption_disabled+0xc8/0xd0 [ 1391.699205] bnx2fc_recv_frame+0xbf9/0x1760 [bnx2fc] [ 1391.699215] ? do_raw_spin_trylock+0xb5/0x180 [ 1391.699221] ? bnx2fc_npiv_create_vports.isra.0+0x4e0/0x4e0 [bnx2fc] [ 1391.699229] ? bnx2fc_l2_rcv_thread+0xb7/0x3a0 [bnx2fc] [ 1391.699240] bnx2fc_l2_rcv_thread+0x1af/0x3a0 [bnx2fc] [ 1391.699250] ? bnx2fc_ulp_init+0xc0/0xc0 [bnx2fc] [ 1391.699258] kthread+0x364/0x420 [ 1391.699263] ? _raw_spin_unlock_irq+0x24/0x50 [ 1391.699268] ? set_kthread_struct+0x100/0x100 [ 1391.699273] ret_from_fork+0x22/0x30 Restore the old get_cpu/put_cpu code with some modifications to reduce the size of the critical section. Link: https://lore.kernel.org/r/20220124145110.442335-1-jmeneghi@redhat.com Fixes: d576a5e80cd0 ("bnx2fc: Improve stats update mechanism") Tested-by: Guangwu Zhang Acked-by: Saurav Kashyap Signed-off-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 9be273c320e2..a826456c6075 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -508,7 +508,8 @@ static int bnx2fc_l2_rcv_thread(void *arg) static void bnx2fc_recv_frame(struct sk_buff *skb) { - u32 fr_len; + u64 crc_err; + u32 fr_len, fr_crc; struct fc_lport *lport; struct fcoe_rcv_info *fr; struct fc_stats *stats; @@ -542,6 +543,11 @@ static void bnx2fc_recv_frame(struct sk_buff *skb) skb_pull(skb, sizeof(struct fcoe_hdr)); fr_len = skb->len - sizeof(struct fcoe_crc_eof); + stats = per_cpu_ptr(lport->stats, get_cpu()); + stats->RxFrames++; + stats->RxWords += fr_len / FCOE_WORD_TO_BYTE; + put_cpu(); + fp = (struct fc_frame *)skb; fc_frame_init(fp); fr_dev(fp) = lport; @@ -624,16 +630,15 @@ static void bnx2fc_recv_frame(struct sk_buff *skb) return; } - stats = per_cpu_ptr(lport->stats, smp_processor_id()); - stats->RxFrames++; - stats->RxWords += fr_len / FCOE_WORD_TO_BYTE; + fr_crc = le32_to_cpu(fr_crc(fp)); - if (le32_to_cpu(fr_crc(fp)) != - ~crc32(~0, skb->data, fr_len)) { - if (stats->InvalidCRCCount < 5) + if (unlikely(fr_crc != ~crc32(~0, skb->data, fr_len))) { + stats = per_cpu_ptr(lport->stats, get_cpu()); + crc_err = (stats->InvalidCRCCount++); + put_cpu(); + if (crc_err < 5) printk(KERN_WARNING PFX "dropping frame with " "CRC error\n"); - stats->InvalidCRCCount++; kfree_skb(skb); return; } From ec049891b2dc16591813eacaddc476b3d27c8c14 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Mon, 31 Jan 2022 11:34:05 +0000 Subject: [PATCH 0871/1295] kselftest: Fix vdso_test_abi return status vdso_test_abi contains a batch of tests that verify the validity of the vDSO ABI. When a vDSO symbol is not found the relevant test is skipped reporting KSFT_SKIP. All the tests return values are then added in a single variable which is checked to verify failures. This approach can have side effects which result in reporting the wrong kselftest exit status. Fix vdso_test_abi verifying the return code of each test separately. Cc: Shuah Khan Cc: Andy Lutomirski Cc: Thomas Gleixner Reported-by: Cristian Marussi Signed-off-by: Vincenzo Frascino Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_abi.c | 135 +++++++++---------- 1 file changed, 62 insertions(+), 73 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 3d603f1394af..883ca85424bc 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -33,110 +33,114 @@ typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -static int vdso_test_gettimeofday(void) +#define VDSO_TEST_PASS_MSG() "\n%s(): PASS\n", __func__ +#define VDSO_TEST_FAIL_MSG(x) "\n%s(): %s FAIL\n", __func__, x +#define VDSO_TEST_SKIP_MSG(x) "\n%s(): SKIP: Could not find %s\n", __func__, x + +static void vdso_test_gettimeofday(void) { /* Find gettimeofday. */ vdso_gettimeofday_t vdso_gettimeofday = (vdso_gettimeofday_t)vdso_sym(version, name[0]); if (!vdso_gettimeofday) { - printf("Could not find %s\n", name[0]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[0])); + return; } struct timeval tv; long ret = vdso_gettimeofday(&tv, 0); if (ret == 0) { - printf("The time is %lld.%06lld\n", - (long long)tv.tv_sec, (long long)tv.tv_usec); + ksft_print_msg("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[0]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[0])); } - - return KSFT_PASS; } -static int vdso_test_clock_gettime(clockid_t clk_id) +static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. */ vdso_clock_gettime_t vdso_clock_gettime = (vdso_clock_gettime_t)vdso_sym(version, name[1]); if (!vdso_clock_gettime) { - printf("Could not find %s\n", name[1]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[1])); + return; } struct timespec ts; long ret = vdso_clock_gettime(clk_id, &ts); if (ret == 0) { - printf("The time is %lld.%06lld\n", - (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[1]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[1])); } - - return KSFT_PASS; } -static int vdso_test_time(void) +static void vdso_test_time(void) { /* Find time. */ vdso_time_t vdso_time = (vdso_time_t)vdso_sym(version, name[2]); if (!vdso_time) { - printf("Could not find %s\n", name[2]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[2])); + return; } long ret = vdso_time(NULL); if (ret > 0) { - printf("The time in hours since January 1, 1970 is %lld\n", + ksft_print_msg("The time in hours since January 1, 1970 is %lld\n", (long long)(ret / 3600)); + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); } else { - printf("%s failed\n", name[2]); - return KSFT_FAIL; + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[2])); } - - return KSFT_PASS; } -static int vdso_test_clock_getres(clockid_t clk_id) +static void vdso_test_clock_getres(clockid_t clk_id) { + int clock_getres_fail = 0; + /* Find clock_getres. */ vdso_clock_getres_t vdso_clock_getres = (vdso_clock_getres_t)vdso_sym(version, name[3]); if (!vdso_clock_getres) { - printf("Could not find %s\n", name[3]); - return KSFT_SKIP; + ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[3])); + return; } struct timespec ts, sys_ts; long ret = vdso_clock_getres(clk_id, &ts); if (ret == 0) { - printf("The resolution is %lld %lld\n", - (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_print_msg("The vdso resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); } else { - printf("%s failed\n", name[3]); - return KSFT_FAIL; + clock_getres_fail++; } ret = syscall(SYS_clock_getres, clk_id, &sys_ts); - if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) { - printf("%s failed\n", name[3]); - return KSFT_FAIL; - } + ksft_print_msg("The syscall resolution is %lld %lld\n", + (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); - return KSFT_PASS; + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) + clock_getres_fail++; + + if (clock_getres_fail > 0) { + ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[3])); + } else { + ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + } } const char *vdso_clock_name[12] = { @@ -158,36 +162,23 @@ const char *vdso_clock_name[12] = { * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. */ -static inline int vdso_test_clock(clockid_t clock_id) +static inline void vdso_test_clock(clockid_t clock_id) { - int ret0, ret1; + ksft_print_msg("\nclock_id: %s\n", vdso_clock_name[clock_id]); - ret0 = vdso_test_clock_gettime(clock_id); - /* A skipped test is considered passed */ - if (ret0 == KSFT_SKIP) - ret0 = KSFT_PASS; + vdso_test_clock_gettime(clock_id); - ret1 = vdso_test_clock_getres(clock_id); - /* A skipped test is considered passed */ - if (ret1 == KSFT_SKIP) - ret1 = KSFT_PASS; - - ret0 += ret1; - - printf("clock_id: %s", vdso_clock_name[clock_id]); - - if (ret0 > 0) - printf(" [FAIL]\n"); - else - printf(" [PASS]\n"); - - return ret0; + vdso_test_clock_getres(clock_id); } +#define VDSO_TEST_PLAN 16 + int main(int argc, char **argv) { unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); - int ret; + + ksft_print_header(); + ksft_set_plan(VDSO_TEST_PLAN); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -201,44 +192,42 @@ int main(int argc, char **argv) vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); - ret = vdso_test_gettimeofday(); + vdso_test_gettimeofday(); #if _POSIX_TIMERS > 0 #ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); + vdso_test_clock(CLOCK_REALTIME); #endif #ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); + vdso_test_clock(CLOCK_BOOTTIME); #endif #ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); + vdso_test_clock(CLOCK_TAI); #endif #ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); + vdso_test_clock(CLOCK_REALTIME_COARSE); #endif #ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); + vdso_test_clock(CLOCK_MONOTONIC); #endif #ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); + vdso_test_clock(CLOCK_MONOTONIC_RAW); #endif #ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); + vdso_test_clock(CLOCK_MONOTONIC_COARSE); #endif #endif - ret += vdso_test_time(); + vdso_test_time(); - if (ret > 0) - return KSFT_FAIL; - - return KSFT_PASS; + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; } From edb854a3680bacc9ef9b91ec0c5ff6105886f6f3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Jan 2022 23:37:33 +0800 Subject: [PATCH 0872/1295] scsi: core: Reallocate device's budget map on queue depth change We currently use ->cmd_per_lun as initial queue depth for setting up the budget_map. Martin Wilck reported that it is common for the queue_depth to be subsequently updated in slave_configure() based on detected hardware characteristics. As a result, for some drivers, the static host template settings for cmd_per_lun and can_queue won't actually get used in practice. And if the default values are used to allocate the budget_map, memory may be consumed unnecessarily. Fix the issue by reallocating the budget_map after ->slave_configure() returns. At that time the device queue_depth should accurately reflect what the hardware needs. Link: https://lore.kernel.org/r/20220127153733.409132-1-ming.lei@redhat.com Cc: Bart Van Assche Reported-by: Martin Wilck Suggested-by: Martin Wilck Tested-by: Martin Wilck Reviewed-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 55 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 3520b9384428..f4e6c68ac99e 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -214,6 +214,48 @@ static void scsi_unlock_floptical(struct scsi_device *sdev, SCSI_TIMEOUT, 3, NULL); } +static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, + unsigned int depth) +{ + int new_shift = sbitmap_calculate_shift(depth); + bool need_alloc = !sdev->budget_map.map; + bool need_free = false; + int ret; + struct sbitmap sb_backup; + + /* + * realloc if new shift is calculated, which is caused by setting + * up one new default queue depth after calling ->slave_configure + */ + if (!need_alloc && new_shift != sdev->budget_map.shift) + need_alloc = need_free = true; + + if (!need_alloc) + return 0; + + /* + * Request queue has to be frozen for reallocating budget map, + * and here disk isn't added yet, so freezing is pretty fast + */ + if (need_free) { + blk_mq_freeze_queue(sdev->request_queue); + sb_backup = sdev->budget_map; + } + ret = sbitmap_init_node(&sdev->budget_map, + scsi_device_max_queue_depth(sdev), + new_shift, GFP_KERNEL, + sdev->request_queue->node, false, true); + if (need_free) { + if (ret) + sdev->budget_map = sb_backup; + else + sbitmap_free(&sb_backup); + ret = 0; + blk_mq_unfreeze_queue(sdev->request_queue); + } + return ret; +} + /** * scsi_alloc_sdev - allocate and setup a scsi_Device * @starget: which target to allocate a &scsi_device for @@ -306,11 +348,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, * default device queue depth to figure out sbitmap shift * since we use this queue depth most of times. */ - if (sbitmap_init_node(&sdev->budget_map, - scsi_device_max_queue_depth(sdev), - sbitmap_calculate_shift(depth), - GFP_KERNEL, sdev->request_queue->node, - false, true)) { + if (scsi_realloc_sdev_budget_map(sdev, depth)) { put_device(&starget->dev); kfree(sdev); goto out; @@ -1017,6 +1055,13 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, } return SCSI_SCAN_NO_RESPONSE; } + + /* + * The queue_depth is often changed in ->slave_configure. + * Set up budget map again since memory consumption of + * the map depends on actual queue depth. + */ + scsi_realloc_sdev_budget_map(sdev, sdev->queue_depth); } if (sdev->scsi_level >= SCSI_3) From 04662bac0067e2fd7f243d6abaa4d779bce14114 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Jan 2022 14:38:18 -0800 Subject: [PATCH 0873/1295] ACPI: require CRC32 to build ACPI core now requires crc32() but the kernel build can fail when CRC32 is not set/enabled, so select it in the ACPI Kconfig entry. Fixes this build error: ia64-linux-ld: drivers/acpi/scan.o: in function `acpi_store_pld_crc': include/acpi/platform/aclinuxex.h:62: undefined reference to `crc32_le' Fixes: 882c982dada4 ("acpi: Store CRC-32 hash of the _PLD in struct acpi_device") Signed-off-by: Randy Dunlap Reported-by: Guenter Roeck Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ba45541b1f1f..273741dedfd2 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -11,6 +11,7 @@ menuconfig ACPI depends on ARCH_SUPPORTS_ACPI select PNP select NLS + select CRC32 default y if X86 help Advanced Configuration and Power Interface (ACPI) support for From 29d650f7e3ab55283b89c9f5883d0c256ce478b5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 24 Jan 2022 15:48:31 -0800 Subject: [PATCH 0874/1295] xfs: reject crazy array sizes being fed to XFS_IOC_GETBMAP* Syzbot tripped over the following complaint from the kernel: WARNING: CPU: 2 PID: 15402 at mm/util.c:597 kvmalloc_node+0x11e/0x125 mm/util.c:597 While trying to run XFS_IOC_GETBMAP against the following structure: struct getbmap fubar = { .bmv_count = 0x22dae649, }; Obviously, this is a crazy huge value since the next thing that the ioctl would do is allocate 37GB of memory. This is enough to make kvmalloc mad, but isn't large enough to trip the validation functions. In other words, I'm fussing with checks that were **already sufficient** because that's easier than dealing with 644 internal bug reports. Yes, that's right, six hundred and forty-four. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Catherine Hoang --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 03a6198c97f6..2515fe8299e1 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1464,7 +1464,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count < 2) return -EINVAL; - if (bmx.bmv_count > ULONG_MAX / recsize) + if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL); From 3d2504663c41104b4359a15f35670cfa82de1bbf Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Tue, 14 Dec 2021 10:08:22 +0000 Subject: [PATCH 0875/1295] i40e: Fix reset bw limit when DCB enabled with 1 TC There was an AQ error I40E_AQ_RC_EINVAL when trying to reset bw limit as part of bw allocation setup. This was caused by trying to reset bw limit with DCB enabled. Bw limit should not be reset when DCB is enabled. The code was relying on the pf->flags to check if DCB is enabled but if only 1 TC is available this flag will not be set even though DCB is enabled. Add a check for number of TC and if it is 1 don't try to reset bw limit even if pf->flags shows DCB as disabled. Fixes: fa38e30ac73f ("i40e: Fix for Tx timeouts when interface is brought up if DCB is enabled") Suggested-by: Alexander Lobakin # Flatten the condition Signed-off-by: Sylwester Dziedziuch Signed-off-by: Jedrzej Jagielski Reviewed-by: Alexander Lobakin Tested-by: Imam Hassan Reza Biswas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f70c478dafdb..5cb4dc69fe87 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5372,7 +5372,15 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, /* There is no need to reset BW when mqprio mode is on. */ if (pf->flags & I40E_FLAG_TC_MQPRIO) return 0; - if (!vsi->mqprio_qopt.qopt.hw && !(pf->flags & I40E_FLAG_DCB_ENABLED)) { + + if (!vsi->mqprio_qopt.qopt.hw) { + if (pf->flags & I40E_FLAG_DCB_ENABLED) + goto skip_reset; + + if (IS_ENABLED(CONFIG_I40E_DCB) && + i40e_dcb_hw_get_num_tc(&pf->hw) == 1) + goto skip_reset; + ret = i40e_set_bw_limit(vsi, vsi->seid, 0); if (ret) dev_info(&pf->pdev->dev, @@ -5380,6 +5388,8 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, vsi->seid); return ret; } + +skip_reset: memset(&bw_data, 0, sizeof(bw_data)); bw_data.tc_valid_bits = enabled_tc; for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) From 0aed75fd30dacd31144188f7ddd5d571db7511c5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:50 +0800 Subject: [PATCH 0876/1295] scsi: pm8001: Fix warning for undescribed param in process_one_iomb() make W=1 complains of an undescribed function parameter: drivers/scsi/pm8001/pm80xx_hwi.c:3938: warning: Function parameter or member 'circularQ' not described in 'process_one_iomb' Fix it. Link: https://lore.kernel.org/r/1643289172-165636-2-git-send-email-john.garry@huawei.com Reported-by: Damien Le Moal Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index c350d2017aa4..b7ab643ef014 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3905,6 +3905,7 @@ static int ssp_coalesced_comp_resp(struct pm8001_hba_info *pm8001_ha, /** * process_one_iomb - process one outbound Queue memory block * @pm8001_ha: our hba card information + * @circularQ: outbound circular queue * @piomb: IO message buffer */ static void process_one_iomb(struct pm8001_hba_info *pm8001_ha, From 61f162aa4381845acbdc7f2be4dfb694d027c018 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:51 +0800 Subject: [PATCH 0877/1295] scsi: pm8001: Fix use-after-free for aborted TMF sas_task Currently a use-after-free may occur if a TMF sas_task is aborted before we handle the IO completion in mpi_ssp_completion(). The abort occurs due to timeout. When the timeout occurs, the SAS_TASK_STATE_ABORTED flag is set and the sas_task is freed in pm8001_exec_internal_tmf_task(). However, if the I/O completion occurs later, the I/O completion still thinks that the sas_task is available. Fix this by clearing the ccb->task if the TMF times out - the I/O completion handler does nothing if this pointer is cleared. Link: https://lore.kernel.org/r/1643289172-165636-3-git-send-email-john.garry@huawei.com Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_sas.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 160ee8b228c9..32edda3e55c6 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -769,8 +769,13 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev, res = -TMF_RESP_FUNC_FAILED; /* Even TMF timed out, return direct. */ if (task->task_state_flags & SAS_TASK_STATE_ABORTED) { + struct pm8001_ccb_info *ccb = task->lldd_task; + pm8001_dbg(pm8001_ha, FAIL, "TMF task[%x]timeout.\n", tmf->tmf); + + if (ccb) + ccb->task = NULL; goto ex_err; } From df7abcaa1246e2537ab4016077b5443bb3c09378 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jan 2022 21:12:52 +0800 Subject: [PATCH 0878/1295] scsi: pm8001: Fix use-after-free for aborted SSP/STP sas_task Currently a use-after-free may occur if a sas_task is aborted by the upper layer before we handle the I/O completion in mpi_ssp_completion() or mpi_sata_completion(). In this case, the following are the two steps in handling those I/O completions: - Call complete() to inform the upper layer handler of completion of the I/O. - Release driver resources associated with the sas_task in pm8001_ccb_task_free() call. When complete() is called, the upper layer may free the sas_task. As such, we should not touch the associated sas_task afterwards, but we do so in the pm8001_ccb_task_free() call. Fix by swapping the complete() and pm8001_ccb_task_free() calls ordering. Link: https://lore.kernel.org/r/1643289172-165636-4-git-send-email-john.garry@huawei.com Reviewed-by: Damien Le Moal Acked-by: Jack Wang Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index b7ab643ef014..9d20f8009b89 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2185,9 +2185,9 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) pm8001_dbg(pm8001_ha, FAIL, "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", t, status, ts->resp, ts->stat); + pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); if (t->slow_task) complete(&t->slow_task->completion); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); } else { spin_unlock_irqrestore(&t->task_state_lock, flags); pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); @@ -2794,9 +2794,9 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, pm8001_dbg(pm8001_ha, FAIL, "task 0x%p done with io_status 0x%x resp 0x%x stat 0x%x but aborted by upper layer!\n", t, status, ts->resp, ts->stat); + pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); if (t->slow_task) complete(&t->slow_task->completion); - pm8001_ccb_task_free(pm8001_ha, t, ccb, tag); } else { spin_unlock_irqrestore(&t->task_state_lock, flags); spin_unlock_irqrestore(&circularQ->oq_lock, From 1b777d4d9e383d2744fc9b3a09af6ec1893c8b1a Mon Sep 17 00:00:00 2001 From: Nick Lopez Date: Sat, 22 Jan 2022 01:19:06 -0700 Subject: [PATCH 0879/1295] drm/nouveau: fix off by one in BIOS boundary checking Bounds checking when parsing init scripts embedded in the BIOS reject access to the last byte. This causes driver initialization to fail on Apple eMac's with GeForce 2 MX GPUs, leaving the system with no working console. This is probably only seen on OpenFirmware machines like PowerPC Macs because the BIOS image provided by OF is only the used parts of the ROM, not a power-of-two blocks read from PCI directly so PCs always have empty bytes at the end that are never accessed. Signed-off-by: Nick Lopez Fixes: 4d4e9907ff572 ("drm/nouveau/bios: guard against out-of-bounds accesses to image") Cc: # v4.10+ Reviewed-by: Ilia Mirkin Reviewed-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/20220122081906.2633061-1-github@glowingmonkey.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c index d0f52d59fc2f..64e423dddd9e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c @@ -38,7 +38,7 @@ nvbios_addr(struct nvkm_bios *bios, u32 *addr, u8 size) *addr += bios->imaged_addr; } - if (unlikely(*addr + size >= bios->size)) { + if (unlikely(*addr + size > bios->size)) { nvkm_error(&bios->subdev, "OOB %d %08x %08x\n", size, p, *addr); return false; } From c763ec4c10f78678d6d4415646237f07109a5a5f Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 31 Jan 2022 19:13:27 +0800 Subject: [PATCH 0880/1295] scsi: hisi_sas: Fix setting of hisi_sas_slot.is_internal The hisi_sas_slot.is_internal member is not set properly for ATA commands which the driver sends directly. A TMF struct pointer is normally used as a test to set this, but it is NULL for those commands. It's not ideal, but pass an empty TMF struct to set that member properly. Link: https://lore.kernel.org/r/1643627607-138785-1-git-send-email-john.garry@huawei.com Fixes: dc313f6b125b ("scsi: hisi_sas: Factor out task prep and delivery code") Reported-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 2f53a2ee024a..ebf5ec38891b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -400,8 +400,7 @@ void hisi_sas_task_deliver(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot, struct hisi_sas_dq *dq, struct hisi_sas_device *sas_dev, - struct hisi_sas_internal_abort *abort, - struct hisi_sas_tmf_task *tmf) + struct hisi_sas_internal_abort *abort) { struct hisi_sas_cmd_hdr *cmd_hdr_base; int dlvry_queue_slot, dlvry_queue; @@ -427,8 +426,6 @@ void hisi_sas_task_deliver(struct hisi_hba *hisi_hba, cmd_hdr_base = hisi_hba->cmd_hdr[dlvry_queue]; slot->cmd_hdr = &cmd_hdr_base[dlvry_queue_slot]; - slot->tmf = tmf; - slot->is_internal = tmf; task->lldd_task = slot; memset(slot->cmd_hdr, 0, sizeof(struct hisi_sas_cmd_hdr)); @@ -587,7 +584,7 @@ static int hisi_sas_task_exec(struct sas_task *task, gfp_t gfp_flags, slot->is_internal = tmf; /* protect task_prep and start_delivery sequence */ - hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, NULL, tmf); + hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, NULL); return 0; @@ -1380,12 +1377,13 @@ static int hisi_sas_softreset_ata_disk(struct domain_device *device) struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); struct device *dev = hisi_hba->dev; int s = sizeof(struct host_to_dev_fis); + struct hisi_sas_tmf_task tmf = {}; ata_for_each_link(link, ap, EDGE) { int pmp = sata_srst_pmp(link); hisi_sas_fill_ata_reset_cmd(link->device, 1, pmp, fis); - rc = hisi_sas_exec_internal_tmf_task(device, fis, s, NULL); + rc = hisi_sas_exec_internal_tmf_task(device, fis, s, &tmf); if (rc != TMF_RESP_FUNC_COMPLETE) break; } @@ -1396,7 +1394,7 @@ static int hisi_sas_softreset_ata_disk(struct domain_device *device) hisi_sas_fill_ata_reset_cmd(link->device, 0, pmp, fis); rc = hisi_sas_exec_internal_tmf_task(device, fis, - s, NULL); + s, &tmf); if (rc != TMF_RESP_FUNC_COMPLETE) dev_err(dev, "ata disk %016llx de-reset failed\n", SAS_ADDR(device->sas_addr)); @@ -2067,7 +2065,7 @@ hisi_sas_internal_abort_task_exec(struct hisi_hba *hisi_hba, int device_id, slot->port = port; slot->is_internal = true; - hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, abort, NULL); + hisi_sas_task_deliver(hisi_hba, slot, dq, sas_dev, abort); return 0; From 6533e558c6505e94c3e0ed4281ed5e31ec985f4d Mon Sep 17 00:00:00 2001 From: Karen Sornek Date: Wed, 12 Jan 2022 10:19:47 +0100 Subject: [PATCH 0881/1295] i40e: Fix reset path while removing the driver Fix the crash in kernel while dereferencing the NULL pointer, when the driver is unloaded and simultaneously the VSI rings are being stopped. The hardware requires 50msec in order to finish RX queues disable. For this purpose the driver spins in mdelay function for the operation to be completed. For example changing number of queues which requires reset would fail in the following call stack: 1) i40e_prep_for_reset 2) i40e_pf_quiesce_all_vsi 3) i40e_quiesce_vsi 4) i40e_vsi_close 5) i40e_down 6) i40e_vsi_stop_rings 7) i40e_vsi_control_rx -> disable requires the delay of 50msecs 8) continue back in i40e_down function where i40e_clean_tx_ring(vsi->tx_rings[i]) is going to crash When the driver was spinning vsi_release called i40e_vsi_free_arrays where the vsi->tx_rings resources were freed and the pointer was set to NULL. Fixes: 5b6d4a7f20b0 ("i40e: Fix crash during removing i40e driver") Signed-off-by: Slawomir Laba Signed-off-by: Sylwester Dziedziuch Signed-off-by: Karen Sornek Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 2e02cc68cd3f..80c5cecaf2b5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -144,6 +144,7 @@ enum i40e_state_t { __I40E_VIRTCHNL_OP_PENDING, __I40E_RECOVERY_MODE, __I40E_VF_RESETS_DISABLED, /* disable resets during i40e_remove */ + __I40E_IN_REMOVE, __I40E_VFS_RELEASING, /* This must be last as it determines the size of the BITMAP */ __I40E_STATE_SIZE__, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5cb4dc69fe87..0c4b7dfb3b35 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -10863,6 +10863,9 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) { int ret; + + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return; /* Now we wait for GRST to settle out. * We don't have to delete the VEBs or VSIs from the hw switch * because the reset will make them disappear. @@ -12222,6 +12225,8 @@ int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count) vsi->req_queue_pairs = queue_count; i40e_prep_for_reset(pf); + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return pf->alloc_rss_size; pf->alloc_rss_size = new_rss_size; @@ -13048,6 +13053,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, if (need_reset) i40e_prep_for_reset(pf); + /* VSI shall be deleted in a moment, just return EINVAL */ + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return -EINVAL; + old_prog = xchg(&vsi->xdp_prog, prog); if (need_reset) { @@ -15938,8 +15947,13 @@ static void i40e_remove(struct pci_dev *pdev) i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0); i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0); - while (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + /* Grab __I40E_RESET_RECOVERY_PENDING and set __I40E_IN_REMOVE + * flags, once they are set, i40e_rebuild should not be called as + * i40e_prep_for_reset always returns early. + */ + while (test_and_set_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) usleep_range(1000, 2000); + set_bit(__I40E_IN_REMOVE, pf->state); if (pf->flags & I40E_FLAG_SRIOV_ENABLED) { set_bit(__I40E_VF_RESETS_DISABLED, pf->state); @@ -16138,6 +16152,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev) { struct i40e_pf *pf = pci_get_drvdata(pdev); + if (test_bit(__I40E_IN_REMOVE, pf->state)) + return; + i40e_reset_and_rebuild(pf, false, false); } From 3ec5586b4699cfb75cdfa09425e11d121db40773 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 24 Jan 2022 13:40:35 +0800 Subject: [PATCH 0882/1295] drm/amd/pm: correct the MGpuFanBoost support for Beige Goby The existing way cannot handle Beige Goby well as a different PPTable data structure(PPTable_beige_goby_t instead of PPTable_t) is used there. Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 777f717c37ae..a4207293158c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -3696,14 +3696,14 @@ static ssize_t sienna_cichlid_get_gpu_metrics(struct smu_context *smu, static int sienna_cichlid_enable_mgpu_fan_boost(struct smu_context *smu) { - struct smu_table_context *table_context = &smu->smu_table; - PPTable_t *smc_pptable = table_context->driver_pptable; + uint16_t *mgpu_fan_boost_limit_rpm; + GET_PPTABLE_MEMBER(MGpuFanBoostLimitRpm, &mgpu_fan_boost_limit_rpm); /* * Skip the MGpuFanBoost setting for those ASICs * which do not support it */ - if (!smc_pptable->MGpuFanBoostLimitRpm) + if (*mgpu_fan_boost_limit_rpm == 0) return 0; return smu_cmn_send_smc_msg_with_param(smu, From a6ed2035878e5ad2e43ed175d8812ac9399d6c40 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 11 Jan 2022 14:00:26 -0600 Subject: [PATCH 0883/1295] drm/amd: Warn users about potential s0ix problems On some OEM setups users can configure the BIOS for S3 or S2idle. When configured to S3 users can still choose 's2idle' in the kernel by using `/sys/power/mem_sleep`. Before commit 6dc8265f9803 ("drm/amdgpu: always reset the asic in suspend (v2)"), the GPU would crash. Now when configured this way, the system should resume but will use more power. As such, adjust the `amdpu_acpi_is_s0ix function` to warn users about potential power consumption issues during their first attempt at suspending. Reported-by: Bjoren Dasse Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1824 Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 24 +++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d8b854fcbffa..5c466d8972f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1408,12 +1408,10 @@ int amdgpu_acpi_smart_shift_update(struct drm_device *dev, enum amdgpu_ss ss_sta int amdgpu_acpi_pcie_notify_device_ready(struct amdgpu_device *adev); void amdgpu_acpi_get_backlight_caps(struct amdgpu_dm_backlight_caps *caps); -bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); void amdgpu_acpi_detect(void); #else static inline int amdgpu_acpi_init(struct amdgpu_device *adev) { return 0; } static inline void amdgpu_acpi_fini(struct amdgpu_device *adev) { } -static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline void amdgpu_acpi_detect(void) { } static inline bool amdgpu_acpi_is_power_shift_control_supported(void) { return false; } static inline int amdgpu_acpi_power_shift_control(struct amdgpu_device *adev, @@ -1422,6 +1420,12 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, enum amdgpu_ss ss_state) { return 0; } #endif +#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) +bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); +#else +static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } +#endif + int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, uint64_t addr, struct amdgpu_bo **bo, struct amdgpu_bo_va_mapping **mapping); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 4811b0faafd9..b19d40751802 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1031,6 +1031,7 @@ void amdgpu_acpi_detect(void) } } +#if IS_ENABLED(CONFIG_SUSPEND) /** * amdgpu_acpi_is_s0ix_active * @@ -1040,11 +1041,24 @@ void amdgpu_acpi_detect(void) */ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { -#if IS_ENABLED(CONFIG_AMD_PMC) && IS_ENABLED(CONFIG_SUSPEND) - if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) { - if (adev->flags & AMD_IS_APU) - return pm_suspend_target_state == PM_SUSPEND_TO_IDLE; + if (!(adev->flags & AMD_IS_APU) || + (pm_suspend_target_state != PM_SUSPEND_TO_IDLE)) + return false; + + if (!(acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)) { + dev_warn_once(adev->dev, + "Power consumption will be higher as BIOS has not been configured for suspend-to-idle.\n" + "To use suspend-to-idle change the sleep mode in BIOS setup.\n"); + return false; } -#endif + +#if !IS_ENABLED(CONFIG_AMD_PMC) + dev_warn_once(adev->dev, + "Power consumption will be higher as the kernel has not been compiled with CONFIG_AMD_PMC.\n"); return false; +#else + return true; +#endif /* CONFIG_AMD_PMC */ } + +#endif /* CONFIG_SUSPEND */ From 4223f86512877b04c932e7203648b37eec931731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 29 Jan 2022 09:27:04 +0300 Subject: [PATCH 0884/1295] net: dsa: mt7530: make NET_DSA_MT7530 select MEDIATEK_GE_PHY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make MediaTek MT753x DSA driver enable MediaTek Gigabit PHYs driver to properly control MT7530 and MT7531 switch PHYs. A noticeable change is that the behaviour of switchport interfaces going up-down-up-down is no longer there. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220129062703.595-1-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index 7b1457a6e327..c0c91440340a 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -36,6 +36,7 @@ config NET_DSA_LANTIQ_GSWIP config NET_DSA_MT7530 tristate "MediaTek MT753x and MT7621 Ethernet switch support" select NET_DSA_TAG_MTK + select MEDIATEK_GE_PHY help This enables support for the MediaTek MT7530, MT7531, and MT7621 Ethernet switch chips. From 7af037c39b600bac2c716dd1228e8ddbe149573f Mon Sep 17 00:00:00 2001 From: Camel Guo Date: Mon, 31 Jan 2022 09:38:40 +0100 Subject: [PATCH 0885/1295] net: stmmac: dump gmac4 DMA registers correctly Unlike gmac100, gmac1000, gmac4 has 27 DMA registers and they are located at DMA_CHAN_BASE_ADDR (0x1100). In order for ethtool to dump gmac4 DMA registers correctly, this commit checks if a net_device has gmac4 and uses different logic to dump its DMA registers. This fixes the following KASAN warning, which can normally be triggered by a command similar like "ethtool -d eth0": BUG: KASAN: vmalloc-out-of-bounds in dwmac4_dump_dma_regs+0x6d4/0xb30 Write of size 4 at addr ffffffc010177100 by task ethtool/1839 kasan_report+0x200/0x21c __asan_report_store4_noabort+0x34/0x60 dwmac4_dump_dma_regs+0x6d4/0xb30 stmmac_ethtool_gregs+0x110/0x204 ethtool_get_regs+0x200/0x4b0 dev_ethtool+0x1dac/0x3800 dev_ioctl+0x7c0/0xb50 sock_ioctl+0x298/0x6c4 ... Fixes: fbf68229ffe7 ("net: stmmac: unify registers dumps methods") Signed-off-by: Camel Guo Link: https://lore.kernel.org/r/20220131083841.3346801-1-camel.guo@axis.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac_dma.h | 1 + .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h index 1914ad698cab..acd70b9a3173 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h @@ -150,6 +150,7 @@ #define NUM_DWMAC100_DMA_REGS 9 #define NUM_DWMAC1000_DMA_REGS 23 +#define NUM_DWMAC4_DMA_REGS 27 void dwmac_enable_dma_transmission(void __iomem *ioaddr); void dwmac_enable_dma_irq(void __iomem *ioaddr, u32 chan, bool rx, bool tx); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 164dff5ec32e..abfb3cd5958d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -21,10 +21,18 @@ #include "dwxgmac2.h" #define REG_SPACE_SIZE 0x1060 +#define GMAC4_REG_SPACE_SIZE 0x116C #define MAC100_ETHTOOL_NAME "st_mac100" #define GMAC_ETHTOOL_NAME "st_gmac" #define XGMAC_ETHTOOL_NAME "st_xgmac" +/* Same as DMA_CHAN_BASE_ADDR defined in dwmac4_dma.h + * + * It is here because dwmac_dma.h and dwmac4_dam.h can not be included at the + * same time due to the conflicting macro names. + */ +#define GMAC4_DMA_CHAN_BASE_ADDR 0x00001100 + #define ETHTOOL_DMA_OFFSET 55 struct stmmac_stats { @@ -434,6 +442,8 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev) if (priv->plat->has_xgmac) return XGMAC_REGSIZE * 4; + else if (priv->plat->has_gmac4) + return GMAC4_REG_SPACE_SIZE; return REG_SPACE_SIZE; } @@ -446,8 +456,13 @@ static void stmmac_ethtool_gregs(struct net_device *dev, stmmac_dump_mac_regs(priv, priv->hw, reg_space); stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space); - if (!priv->plat->has_xgmac) { - /* Copy DMA registers to where ethtool expects them */ + /* Copy DMA registers to where ethtool expects them */ + if (priv->plat->has_gmac4) { + /* GMAC4 dumps its DMA registers at its DMA_CHAN_BASE_ADDR */ + memcpy(®_space[ETHTOOL_DMA_OFFSET], + ®_space[GMAC4_DMA_CHAN_BASE_ADDR / 4], + NUM_DWMAC4_DMA_REGS * 4); + } else if (!priv->plat->has_xgmac) { memcpy(®_space[ETHTOOL_DMA_OFFSET], ®_space[DMA_BUS_MODE / 4], NUM_DWMAC1000_DMA_REGS * 4); From 9cef24c8b76c1f6effe499d2f131807c90f7ce9a Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Sun, 30 Jan 2022 13:29:01 +0200 Subject: [PATCH 0886/1295] net: macsec: Fix offload support for NETDEV_UNREGISTER event Current macsec netdev notify handler handles NETDEV_UNREGISTER event by releasing relevant SW resources only, this causes resources leak in case of macsec HW offload, as the underlay driver was not notified to clean it's macsec offload resources. Fix by calling the underlay driver to clean it's relevant resources by moving offload handling from macsec_dellink() to macsec_common_dellink() when handling NETDEV_UNREGISTER event. Fixes: 3cf3227a21d1 ("net: macsec: hardware offloading infrastructure") Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Reviewed-by: Antoine Tenart Link: https://lore.kernel.org/r/1643542141-28956-1-git-send-email-raeds@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 16aa3a478e9e..33ff33c05aab 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3870,6 +3870,18 @@ static void macsec_common_dellink(struct net_device *dev, struct list_head *head struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (ops) { + ctx.secy = &macsec->secy; + macsec_offload(ops->mdo_del_secy, &ctx); + } + } + unregister_netdevice_queue(dev, head); list_del_rcu(&macsec->secys); macsec_del_dev(macsec); @@ -3884,18 +3896,6 @@ static void macsec_dellink(struct net_device *dev, struct list_head *head) struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); - /* If h/w offloading is available, propagate to the device */ - if (macsec_is_offloaded(macsec)) { - const struct macsec_ops *ops; - struct macsec_context ctx; - - ops = macsec_get_ops(netdev_priv(dev), &ctx); - if (ops) { - ctx.secy = &macsec->secy; - macsec_offload(ops->mdo_del_secy, &ctx); - } - } - macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { From ff4865b3c8cd746ef72f59bdd485848b4cebd43d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Jan 2022 20:48:49 +0100 Subject: [PATCH 0887/1295] ALSA: Replace acpi_bus_get_device() Replace acpi_bus_get_device() that is going to be dropped with acpi_fetch_acpi_dev(). No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/2828205.e9J7NaK4W3@kreacher Signed-off-by: Takashi Iwai --- sound/hda/intel-sdw-acpi.c | 7 +++---- sound/soc/soc-acpi.c | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c index b7758dbe2371..5cb92f7ccbca 100644 --- a/sound/hda/intel-sdw-acpi.c +++ b/sound/hda/intel-sdw-acpi.c @@ -50,11 +50,11 @@ static bool is_link_enabled(struct fwnode_handle *fw_node, int i) static int sdw_intel_scan_controller(struct sdw_intel_acpi_info *info) { - struct acpi_device *adev; + struct acpi_device *adev = acpi_fetch_acpi_dev(info->handle); int ret, i; u8 count; - if (acpi_bus_get_device(info->handle, &adev)) + if (!adev) return -EINVAL; /* Found controller, find links supported */ @@ -119,7 +119,6 @@ static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level, void *cdata, void **return_value) { struct sdw_intel_acpi_info *info = cdata; - struct acpi_device *adev; acpi_status status; u64 adr; @@ -127,7 +126,7 @@ static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level, if (ACPI_FAILURE(status)) return AE_OK; /* keep going */ - if (acpi_bus_get_device(handle, &adev)) { + if (!acpi_fetch_acpi_dev(handle)) { pr_err("%s: Couldn't find ACPI handle\n", __func__); return AE_NOT_FOUND; } diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index cbd7ea48837b..142476f1396f 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -55,16 +55,13 @@ EXPORT_SYMBOL_GPL(snd_soc_acpi_find_machine); static acpi_status snd_soc_acpi_find_package(acpi_handle handle, u32 level, void *context, void **ret) { - struct acpi_device *adev; + struct acpi_device *adev = acpi_fetch_acpi_dev(handle); acpi_status status; struct snd_soc_acpi_package_context *pkg_ctx = context; pkg_ctx->data_valid = false; - if (acpi_bus_get_device(handle, &adev)) - return AE_OK; - - if (adev->status.present && adev->status.functional) { + if (adev && adev->status.present && adev->status.functional) { struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *myobj = NULL; From 4ee02e20893d2f9e951c7888f2284fa608ddaa35 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 31 Jan 2022 19:35:16 +0100 Subject: [PATCH 0888/1295] ALSA: usb-audio: Correct quirk for VF0770 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This device provides both audio and video. The original quirk added in commit 48827e1d6af5 ("ALSA: usb-audio: Add quirk for VF0770") used USB_DEVICE to match the vendor and product ID. Depending on module order, if snd-usb-audio was asked first, it would match the entire device and uvcvideo wouldn't get to see it. Change the matching to USB_AUDIO_DEVICE to restore uvcvideo matching in all cases. Fixes: 48827e1d6af5 ("ALSA: usb-audio: Add quirk for VF0770") Reported-by: Jukka Heikintalo Tested-by: Jukka Heikintalo Reported-by: Paweł Susicki Tested-by: Paweł Susicki Cc: # 5.4, 5.10, 5.14, 5.15 Signed-off-by: Jonas Hahnfeld Link: https://lore.kernel.org/r/20220131183516.61191-1-hahnjo@hahnjo.de Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index b1522e43173e..0ea39565e623 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -84,7 +84,7 @@ * combination. */ { - USB_DEVICE(0x041e, 0x4095), + USB_AUDIO_DEVICE(0x041e, 0x4095), .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { .ifnum = QUIRK_ANY_INTERFACE, .type = QUIRK_COMPOSITE, From 50317b636e7184d15126e2dfc83db0963a38d31e Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Mon, 31 Jan 2022 11:07:02 +0100 Subject: [PATCH 0889/1295] MIPS: octeon: Fix missed PTR->PTR_WD conversion Fixes: fa62f39dc7e2 ("MIPS: Fix build error due to PTR used in more places") Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/octeon-memcpy.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 0a515cde1c18..25860fba6218 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -74,7 +74,7 @@ #define EXC(inst_reg,addr,handler) \ 9: inst_reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous /* From 2161ba070999a709f975910b6b9ad6b51cd6f120 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 Jan 2022 12:58:19 -0800 Subject: [PATCH 0890/1295] MIPS: KVM: fix vz.c kernel-doc notation Fix all kernel-doc warnings in mips/kvm/vz.c as reported by the kernel test robot: arch/mips/kvm/vz.c:471: warning: Function parameter or member 'out_compare' not described in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Function parameter or member 'out_cause' not described in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Excess function parameter 'compare' description in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:471: warning: Excess function parameter 'cause' description in '_kvm_vz_save_htimer' arch/mips/kvm/vz.c:1551: warning: No description found for return value of 'kvm_trap_vz_handle_cop_unusable' arch/mips/kvm/vz.c:1552: warning: expecting prototype for kvm_trap_vz_handle_cop_unusuable(). Prototype was for kvm_trap_vz_handle_cop_unusable() instead arch/mips/kvm/vz.c:1597: warning: No description found for return value of 'kvm_trap_vz_handle_msa_disabled' Fixes: c992a4f6a9b0 ("KVM: MIPS: Implement VZ support") Fixes: f4474d50c7d4 ("KVM: MIPS/VZ: Support hardware guest timer") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Cc: Huacai Chen Cc: Aleksandar Markovic Cc: James Hogan Cc: kvm@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/kvm/vz.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 4adca5abbc72..c706f5890a05 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -458,8 +458,8 @@ void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) /** * _kvm_vz_save_htimer() - Switch to software emulation of guest timer. * @vcpu: Virtual CPU. - * @compare: Pointer to write compare value to. - * @cause: Pointer to write cause value to. + * @out_compare: Pointer to write compare value to. + * @out_cause: Pointer to write cause value to. * * Save VZ guest timer state and switch to software emulation of guest CP0 * timer. The hard timer must already be in use, so preemption should be @@ -1541,11 +1541,14 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) } /** - * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor. + * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor. * @vcpu: Virtual CPU context. * * Handle when the guest attempts to use a coprocessor which hasn't been allowed * by the root context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) { @@ -1592,6 +1595,9 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) * * Handle when the guest attempts to use MSA when it is disabled in the root * context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) { From 92c4cfaee6872038563c5b6f2e8e613f9d84d47d Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Sat, 29 Jan 2022 16:06:04 +0100 Subject: [PATCH 0891/1295] dma-buf: heaps: Fix potential spectre v1 gadget It appears like nr could be a Spectre v1 gadget as it's supplied by a user and used as an array index. Prevent the contents of kernel memory from being leaked to userspace via speculative execution by using array_index_nospec. Signed-off-by: Jordy Zomer Fixes: c02a81fba74f ("dma-buf: Add dma-buf heaps framework") Cc: # v5.6+ Acked-by: John Stultz Signed-off-by: Sumit Semwal [sumits: added fixes and cc: stable tags] Link: https://patchwork.freedesktop.org/patch/msgid/20220129150604.3461652-1-jordy@pwning.systems --- drivers/dma-buf/dma-heap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c index 56bf5ad01ad5..8f5848aa144f 100644 --- a/drivers/dma-buf/dma-heap.c +++ b/drivers/dma-buf/dma-heap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,7 @@ static long dma_heap_ioctl(struct file *file, unsigned int ucmd, if (nr >= ARRAY_SIZE(dma_heap_ioctl_cmds)) return -EINVAL; + nr = array_index_nospec(nr, ARRAY_SIZE(dma_heap_ioctl_cmds)); /* Get the kernel ioctl cmd that matches */ kcmd = dma_heap_ioctl_cmds[nr]; From 7d73c602154df56802a9e75ac212505fc1e9a2b6 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Mon, 24 Jan 2022 18:01:24 -0800 Subject: [PATCH 0892/1295] drm/i915/pmu: Fix KMD and GuC race on accessing busyness GuC updates shared memory and KMD reads it. Since this is not synchronized, we run into a race where the value read is inconsistent. Sometimes the inconsistency is in reading the upper MSB bytes of the last_switch_in value. 2 types of cases are seen - upper 8 bits are zero and upper 24 bits are zero. Since these are non-zero values, it is not trivial to determine validity of these values. Instead we read the values multiple times until they are consistent. In test runs, 3 attempts results in consistent values. The upper bound is set to 6 attempts and may need to be tuned as per any new occurences. Since the duration that gt is parked can vary, the patch also updates the gt timestamp on unpark before starting the worker. v2: - Initialize i - Use READ_ONCE to access engine record Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Alan Previn Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20220125020124.788679-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit 512712a824de9b856a4e61343e3e4390eba2c391) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index fcec2cb833af..154ad726e266 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1113,6 +1113,19 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) if (new_start == lower_32_bits(*prev_start)) return; + /* + * When gt is unparked, we update the gt timestamp and start the ping + * worker that updates the gt_stamp every POLL_TIME_CLKS. As long as gt + * is unparked, all switched in contexts will have a start time that is + * within +/- POLL_TIME_CLKS of the most recent gt_stamp. + * + * If neither gt_stamp nor new_start has rolled over, then the + * gt_stamp_hi does not need to be adjusted, however if one of them has + * rolled over, we need to adjust gt_stamp_hi accordingly. + * + * The below conditions address the cases of new_start rollover and + * gt_stamp_last rollover respectively. + */ if (new_start < gt_stamp_last && (new_start - gt_stamp_last) <= POLL_TIME_CLKS) gt_stamp_hi++; @@ -1124,17 +1137,45 @@ __extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start) *prev_start = ((u64)gt_stamp_hi << 32) | new_start; } -static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) +/* + * GuC updates shared memory and KMD reads it. Since this is not synchronized, + * we run into a race where the value read is inconsistent. Sometimes the + * inconsistency is in reading the upper MSB bytes of the last_in value when + * this race occurs. 2 types of cases are seen - upper 8 bits are zero and upper + * 24 bits are zero. Since these are non-zero values, it is non-trivial to + * determine validity of these values. Instead we read the values multiple times + * until they are consistent. In test runs, 3 attempts results in consistent + * values. The upper bound is set to 6 attempts and may need to be tuned as per + * any new occurences. + */ +static void __get_engine_usage_record(struct intel_engine_cs *engine, + u32 *last_in, u32 *id, u32 *total) { struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine); + int i = 0; + + do { + *last_in = READ_ONCE(rec->last_switch_in_stamp); + *id = READ_ONCE(rec->current_context_index); + *total = READ_ONCE(rec->total_runtime); + + if (READ_ONCE(rec->last_switch_in_stamp) == *last_in && + READ_ONCE(rec->current_context_index) == *id && + READ_ONCE(rec->total_runtime) == *total) + break; + } while (++i < 6); +} + +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) +{ struct intel_engine_guc_stats *stats = &engine->stats.guc; struct intel_guc *guc = &engine->gt->uc.guc; - u32 last_switch = rec->last_switch_in_stamp; - u32 ctx_id = rec->current_context_index; - u32 total = rec->total_runtime; + u32 last_switch, ctx_id, total; lockdep_assert_held(&guc->timestamp.lock); + __get_engine_usage_record(engine, &last_switch, &ctx_id, &total); + stats->running = ctx_id != ~0U && last_switch; if (stats->running) __extend_last_switch(guc, &stats->start_gt_clk, last_switch); @@ -1236,6 +1277,10 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) if (!in_reset && intel_gt_pm_get_if_awake(gt)) { stats_saved = *stats; gt_stamp_saved = guc->timestamp.gt_stamp; + /* + * Update gt_clks, then gt timestamp to simplify the 'gt_stamp - + * start_gt_clk' calculation below for active engines. + */ guc_update_engine_gt_clks(engine); guc_update_pm_timestamp(guc, now); intel_gt_pm_put_async(gt); @@ -1364,10 +1409,15 @@ void intel_guc_busyness_park(struct intel_gt *gt) void intel_guc_busyness_unpark(struct intel_gt *gt) { struct intel_guc *guc = >->uc.guc; + unsigned long flags; + ktime_t unused; if (!guc_submission_initialized(guc)) return; + spin_lock_irqsave(&guc->timestamp.lock, flags); + guc_update_pm_timestamp(guc, &unused); + spin_unlock_irqrestore(&guc->timestamp.lock, flags); mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay); } From fbb9b194e15a63c56c5664e76ccd0e85c6100cea Mon Sep 17 00:00:00 2001 From: Cameron Williams Date: Tue, 1 Feb 2022 10:12:51 +0000 Subject: [PATCH 0893/1295] USB: serial: ftdi_sio: add support for Brainboxes US-159/235/320 This patch adds support for the Brainboxes US-159, US-235 and US-320 USB-to-Serial devices. Signed-off-by: Cameron Williams Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/ftdi_sio.c | 3 +++ drivers/usb/serial/ftdi_sio_ids.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 4edebd14ef29..49c08f07c969 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -969,6 +969,7 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_VX_023_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_VX_034_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_101_PID) }, + { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_159_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_1_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_2_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_3_PID) }, @@ -977,12 +978,14 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_6_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_7_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_160_8_PID) }, + { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_235_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_257_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_279_1_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_279_2_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_279_3_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_279_4_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_313_PID) }, + { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_320_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_324_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_346_1_PID) }, { USB_DEVICE(BRAINBOXES_VID, BRAINBOXES_US_346_2_PID) }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 755858ca20ba..d1a9564697a4 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -1506,6 +1506,9 @@ #define BRAINBOXES_VX_023_PID 0x1003 /* VX-023 ExpressCard 1 Port RS422/485 */ #define BRAINBOXES_VX_034_PID 0x1004 /* VX-034 ExpressCard 2 Port RS422/485 */ #define BRAINBOXES_US_101_PID 0x1011 /* US-101 1xRS232 */ +#define BRAINBOXES_US_159_PID 0x1021 /* US-159 1xRS232 */ +#define BRAINBOXES_US_235_PID 0x1017 /* US-235 1xRS232 */ +#define BRAINBOXES_US_320_PID 0x1019 /* US-320 1xRS422/485 */ #define BRAINBOXES_US_324_PID 0x1013 /* US-324 1xRS422/485 1Mbaud */ #define BRAINBOXES_US_606_1_PID 0x2001 /* US-606 6 Port RS232 Serial Port 1 and 2 */ #define BRAINBOXES_US_606_2_PID 0x2002 /* US-606 6 Port RS232 Serial Port 3 and 4 */ From b50f8f09c622297d3cf46e332e17ba8adedec9af Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Feb 2022 11:42:52 +0100 Subject: [PATCH 0894/1295] USB: serial: cp210x: add NCR Retail IO box id Add the device id for NCR's Retail IO box (CP2105) used in NCR FastLane SelfServ Checkout - R6C: https://www.ncr.com/product-catalog/ncr-fastlane-selfserv-checkout-r6c Reported-by: Scott Russell Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 8a60c0d56863..5172e7ac16fd 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -51,6 +51,7 @@ static void cp210x_enable_event_mode(struct usb_serial_port *port); static void cp210x_disable_event_mode(struct usb_serial_port *port); static const struct usb_device_id id_table[] = { + { USB_DEVICE(0x0404, 0x034C) }, /* NCR Retail IO Box */ { USB_DEVICE(0x045B, 0x0053) }, /* Renesas RX610 RX-Stick */ { USB_DEVICE(0x0471, 0x066A) }, /* AKTAKOM ACE-1001 cable */ { USB_DEVICE(0x0489, 0xE000) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */ From 6ca0c6283340d819bf9c7d8e76be33c9fbd903ab Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Feb 2022 11:42:53 +0100 Subject: [PATCH 0895/1295] USB: serial: cp210x: add CPI Bulk Coin Recycler id Add the device id for the Crane Payment Innovation / Money Controls Bulk Coin Recycler: https://www.cranepi.com/en/system/files/Support/OM_BCR_EN_V1-04_0.pdf Reported-by: Scott Russell Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 5172e7ac16fd..a27f7efcec6a 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -69,6 +69,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x0FCF, 0x1004) }, /* Dynastream ANT2USB */ { USB_DEVICE(0x0FCF, 0x1006) }, /* Dynastream ANT development board */ { USB_DEVICE(0x0FDE, 0xCA05) }, /* OWL Wireless Electricity Monitor CM-160 */ + { USB_DEVICE(0x106F, 0x0003) }, /* CPI / Money Controls Bulk Coin Recycler */ { USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */ { USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */ { USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */ From 57dfd7b53dec740afe402135fdd1c5708ec337f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:51:48 +0000 Subject: [PATCH 0896/1295] KVM: x86: Move delivery of non-APICv interrupt into vendor code Handle non-APICv interrupt delivery in vendor code, even though it means VMX and SVM will temporarily have duplicate code. SVM's AVIC has a race condition that requires KVM to fall back to legacy interrupt injection _after_ the interrupt has been logged in the vIRR, i.e. to fix the race, SVM will need to open code the full flow anyways[*]. Refactor the code so that the SVM bug without introducing other issues, e.g. SVM would return "success" and thus invoke trace_kvm_apicv_accept_irq() even when delivery through the AVIC failed, and to opportunistically prepare for using KVM_X86_OP to fill each vendor's kvm_x86_ops struct, which will rely on the vendor function matching the kvm_x86_op pointer name. No functional change intended. [*] https://lore.kernel.org/all/20211213104634.199141-4-mlevitsk@redhat.com Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/lapic.c | 10 ++-------- arch/x86/kvm/svm/svm.c | 17 ++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++++- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 631d5040b31e..d39e0de06be2 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -82,7 +82,7 @@ KVM_X86_OP_NULL(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) KVM_X86_OP_NULL(set_apic_access_page_addr) -KVM_X86_OP(deliver_posted_interrupt) +KVM_X86_OP(deliver_interrupt) KVM_X86_OP_NULL(sync_pir_to_irr) KVM_X86_OP(set_tss_addr) KVM_X86_OP(set_identity_map_addr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index efee29d5ad4f..80e285533371 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1409,7 +1409,8 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); - int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4662469240bc..d7e6fde82d25 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1096,14 +1096,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) { - kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } else { - trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); - } + static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode, + trig_mode, vector); break; case APIC_DM_REMRD: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9cef8e4598df..5772dd6f79a4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3291,6 +3291,21 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (svm_deliver_avic_intr(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4545,7 +4560,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, - .deliver_posted_interrupt = svm_deliver_avic_intr, + .deliver_interrupt = svm_deliver_interrupt, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b1165bb13a5a..3c0ba5b1bbcf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4041,6 +4041,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) return 0; } +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -7766,7 +7781,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, From ee12595147ac1fbfb5bcb23837e26dd58d94b15d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Jan 2022 22:57:01 +0300 Subject: [PATCH 0897/1295] fanotify: Fix stale file descriptor in copy_event_to_user() This code calls fd_install() which gives the userspace access to the fd. Then if copy_info_records_to_user() fails it calls put_unused_fd(fd) but that will not release it and leads to a stale entry in the file descriptor table. Generally you can't trust the fd after a call to fd_install(). The fix is to delay the fd_install() until everything else has succeeded. Fortunately it requires CAP_SYS_ADMIN to reach this code so the security impact is less. Fixes: f644bc449b37 ("fanotify: fix copy_event_to_user() fid error clean up") Link: https://lore.kernel.org/r/20220128195656.GA26981@kili Signed-off-by: Dan Carpenter Reviewed-by: Mathias Krause Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1026f67b1d1e..2ff6bd85ba8f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -701,9 +701,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->fd = fd; - if (f) - fd_install(fd, f); - if (info_mode) { ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); @@ -711,6 +708,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, goto out_close_fd; } + if (f) + fd_install(fd, f); + return metadata.event_len; out_close_fd: From 881cc731df6af99a21622e9be25a23b81adcd10b Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Mon, 31 Jan 2022 13:56:41 +0000 Subject: [PATCH 0898/1295] net: phy: Fix qca8081 with speeds lower than 2.5Gb/s A typo in qca808x_read_status means we try to set SMII mode on the port rather than SGMII when the link speed is not 2.5Gb/s. This results in no traffic due to the mismatch in configuration between the phy and the mac. v2: Only change interface mode when the link is up Fixes: 79c7bc0521545 ("net: phy: add qca8081 read_status") Cc: stable@vger.kernel.org Signed-off-by: Jonathan McDowell Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 5b6c0d120e09..29aa811af430 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -1688,19 +1688,19 @@ static int qca808x_read_status(struct phy_device *phydev) if (ret < 0) return ret; - if (phydev->link && phydev->speed == SPEED_2500) - phydev->interface = PHY_INTERFACE_MODE_2500BASEX; - else - phydev->interface = PHY_INTERFACE_MODE_SMII; - - /* generate seed as a lower random value to make PHY linked as SLAVE easily, - * except for master/slave configuration fault detected. - * the reason for not putting this code into the function link_change_notify is - * the corner case where the link partner is also the qca8081 PHY and the seed - * value is configured as the same value, the link can't be up and no link change - * occurs. - */ - if (!phydev->link) { + if (phydev->link) { + if (phydev->speed == SPEED_2500) + phydev->interface = PHY_INTERFACE_MODE_2500BASEX; + else + phydev->interface = PHY_INTERFACE_MODE_SGMII; + } else { + /* generate seed as a lower random value to make PHY linked as SLAVE easily, + * except for master/slave configuration fault detected. + * the reason for not putting this code into the function link_change_notify is + * the corner case where the link partner is also the qca8081 PHY and the seed + * value is configured as the same value, the link can't be up and no link change + * occurs. + */ if (phydev->master_slave_state == MASTER_SLAVE_STATE_ERR) { qca808x_phy_ms_seed_enable(phydev, false); } else { From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:22 +0000 Subject: [PATCH 0899/1295] kvm: add guest_state_{enter,exit}_irqoff() When transitioning to/from guest mode, it is necessary to inform lockdep, tracing, and RCU in a specific order, similar to the requirements for transitions to/from user mode. Additionally, it is necessary to perform vtime accounting for a window around running the guest, with RCU enabled, such that timer interrupts taken from the guest can be accounted as guest time. Most architectures don't handle all the necessary pieces, and a have a number of common bugs, including unsafe usage of RCU during the window between guest_enter() and guest_exit(). On x86, this was dealt with across commits: 87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs") 0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit") 9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit") 3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") 135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text") 160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling") bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers") ... but those fixes are specific to x86, and as the resulting logic (while correct) is split across generic helper functions and x86-specific helper functions, it is difficult to see that the entry/exit accounting is balanced. This patch adds generic helpers which architectures can use to handle guest entry/exit consistently and correctly. The guest_{enter,exit}() helpers are split into guest_timing_{enter,exit}() to perform vtime accounting, and guest_context_{enter,exit}() to perform the necessary context tracking and RCU management. The existing guest_{enter,exit}() heleprs are left as wrappers of these. Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff() helpers are added to handle the ordering of lockdep, tracing, and RCU manageent. These are inteneded to mirror exit_to_user_mode() and enter_from_user_mode(). Subsequent patches will migrate architectures over to the new helpers, following a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); This sequences handles all of the above correctly, and more clearly balances the entry and exit portions, making it easier to understand. The existing helpers are marked as deprecated, and will be removed once all architectures have been converted. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f079820f52b5..b3810976a27f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -368,8 +370,11 @@ struct kvm_vcpu { u64 last_used_slot_gen; }; -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +/* + * Start accounting time towards a guest. + * Must be called before entering guest context. + */ +static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the @@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); +} +/* + * Enter guest context and enter an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_enter_irqoff(void) +{ /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* + * Deprecated. Architectures should move to guest_timing_enter_irqoff() and + * guest_state_enter_irqoff(). + */ +static __always_inline void guest_enter_irqoff(void) +{ + guest_timing_enter_irqoff(); + guest_context_enter_irqoff(); +} + +/** + * guest_state_enter_irqoff - Fixup state when entering a guest + * + * Entry to a guest will enable interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code before entering a guest. + * Must be called with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_enter_irqoff() before this. + * + * Note: this is analogous to exit_to_user_mode(). + */ +static __always_inline void guest_state_enter_irqoff(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_context_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* + * Exit guest context and exit an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_exit_irqoff(void) { context_tracking_guest_exit(); +} +/* + * Stop accounting time towards a guest. + * Must be called after exiting guest context. + */ +static __always_inline void guest_timing_exit_irqoff(void) +{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } +/* + * Deprecated. Architectures should move to guest_state_exit_irqoff() and + * guest_timing_exit_irqoff(). + */ +static __always_inline void guest_exit_irqoff(void) +{ + guest_context_exit_irqoff(); + guest_timing_exit_irqoff(); +} + static inline void guest_exit(void) { unsigned long flags; @@ -413,6 +492,33 @@ static inline void guest_exit(void) local_irq_restore(flags); } +/** + * guest_state_exit_irqoff - Establish state when returning from guest mode + * + * Entry from a guest disables interrupts, but guest mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific code after exiting a guest. + * Must be invoked with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_exit_irqoff() after this. + * + * Note: this is analogous to enter_from_user_mode(). + */ +static __always_inline void guest_state_exit_irqoff(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_context_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* From 72e3244512b34756a7e8aa67eff45cdcb040ac4e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:26 +0000 Subject: [PATCH 0900/1295] kvm/mips: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we use guest_enter_irqoff() and guest_exit_irqoff() directly, with interrupts masked between these. As we don't handle any timer ticks during this window, we will not account time spent within the guest as guest time, which is unfortunate. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); In addition, as guest exits during the "run the vcpu" step are handled by kvm_mips_handle_exit(), a wrapper function is added which ensures that such exists are handled with a sequence: guest_state_exit_irqoff(); < handle the exit > guest_state_enter_irqoff(); This means that exits which stop the vCPU running will have a redundant guest_state_enter_irqoff() .. guest_state_exit_irqoff() sequence, which can be addressed with future rework. Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_mips_enter_exit_vcpu() helper which is marked noinstr. Signed-off-by: Mark Rutland Cc: Aleksandar Markovic Cc: Frederic Weisbecker Cc: Huacai Chen Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Thomas Bogendoerfer Message-Id: <20220201132926.3301912-6-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 50 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e59cb6246f76..a25e0b73ee70 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -414,6 +414,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_mips_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_mips_callbacks->vcpu_run(vcpu); + guest_state_exit_irqoff(); + + return ret; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r = -EINTR; @@ -434,7 +452,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) lose_fpu(1); local_irq_disable(); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); trace_kvm_enter(vcpu); /* @@ -445,10 +463,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = kvm_mips_callbacks->vcpu_run(vcpu); + r = kvm_mips_vcpu_enter_exit(vcpu); + + /* + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. + * + * TODO: is there a barrier which ensures that pending interrupts are + * recognised? Currently this just hopes that the CPU takes any pending + * interrupts between the enable and disable. + */ + local_irq_enable(); + local_irq_disable(); trace_kvm_out(vcpu); - guest_exit_irqoff(); + guest_timing_exit_irqoff(); local_irq_enable(); out: @@ -1168,7 +1199,7 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +static int __kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; @@ -1357,6 +1388,17 @@ int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) return ret; } +int noinstr kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_exit_irqoff(); + ret = __kvm_mips_handle_exit(vcpu); + guest_state_enter_irqoff(); + + return ret; +} + /* Enable FPU for guest and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { From b2d2af7e5df37ee3a9ba6b405bdbb7691a5c2dfc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:24 +0000 Subject: [PATCH 0901/1295] kvm/x86: rework guest entry logic For consistency and clarity, migrate x86 over to the generic helpers for guest timing and lockdep/RCU/tracing management, and remove the x86-specific helpers. Prior to this patch, the guest timing was entered in kvm_guest_enter_irqoff() (called by svm_vcpu_enter_exit() and svm_vcpu_enter_exit()), and was exited by the call to vtime_account_guest_exit() within vcpu_enter_guest(). To minimize duplication and to more clearly balance entry and exit, both entry and exit of guest timing are placed in vcpu_enter_guest(), using the new guest_timing_{enter,exit}_irqoff() helpers. When context tracking is used a small amount of additional time will be accounted towards guests; tick-based accounting is unnaffected as IRQs are disabled at this point and not enabled until after the return from the guest. This also corrects (benign) mis-balanced context tracking accounting introduced in commits: ae95f566b3d22ade ("KVM: X86: TSCDEADLINE MSR emulation fastpath") 26efe2fd92e50822 ("KVM: VMX: Handle preemption timer fastpath") Where KVM can enter a guest multiple times, calling vtime_guest_enter() without a corresponding call to vtime_account_guest_exit(), and with vtime_account_system() called when vtime_account_guest() should be used. As account_system_time() checks PF_VCPU and calls account_guest_time(), this doesn't result in any functional problem, but is unnecessarily confusing. Signed-off-by: Mark Rutland Acked-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jim Mattson Cc: Joerg Roedel Cc: Sean Christopherson Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Wanpeng Li Message-Id: <20220201132926.3301912-4-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 4 +++- arch/x86/kvm/x86.h | 45 ------------------------------------------ 4 files changed, 7 insertions(+), 50 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5772dd6f79a4..ea2f7f3614af 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3630,7 +3630,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned long vmcb_pa = svm->current_vmcb->pa; - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(vmcb_pa); @@ -3650,7 +3650,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) vmload(__sme_page_pa(sd->save_area)); } - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3c0ba5b1bbcf..c0c256c33d21 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6767,7 +6767,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6783,7 +6783,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c25a6ef0ff06..fec3dd4f0718 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10088,6 +10088,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(0, 7); } + guest_timing_enter_irqoff(); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -10172,7 +10174,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * of accounting via context tracking, but the loss of accuracy is * acceptable for all known use cases. */ - vtime_account_guest_exit(); + guest_timing_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1ebd5a7594da..20c7a1fb90bb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,51 +10,6 @@ void kvm_spurious_fault(void); -static __always_inline void kvm_guest_enter_irqoff(void) -{ - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. - */ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static __always_inline void kvm_guest_exit_irqoff(void) -{ - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * context_tracking_guest_exit() restores host context and reinstates - * RCU if enabled and required. - * - * This needs to be done immediately after VM-Exit, before any code - * that might contain tracepoints or call out to the greater world, - * e.g. before x86_spec_ctrl_restore_host(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - context_tracking_guest_exit(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ ({ \ bool failed = (consistency_check); \ From b43a76f423aa304037603fd6165c4a534d2c09a7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Sun, 30 Jan 2022 18:08:15 +0100 Subject: [PATCH 0902/1295] RDMA/siw: Fix broken RDMA Read Fence/Resume logic. Code unconditionally resumed fenced SQ processing after next RDMA Read completion, even if other RDMA Read responses are still outstanding, or ORQ is full. Also adds comments for better readability of fence processing, and removes orq_get_tail() helper, which is not needed anymore. Fixes: 8b6a361b8c48 ("rdma/siw: receive path") Fixes: a531975279f3 ("rdma/siw: main include file") Link: https://lore.kernel.org/r/20220130170815.1940-1-bmt@zurich.ibm.com Reported-by: Jared Holzman Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw.h | 7 +------ drivers/infiniband/sw/siw/siw_qp_rx.c | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 368959ae9a8c..df03d84c6868 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -644,14 +644,9 @@ static inline struct siw_sqe *orq_get_current(struct siw_qp *qp) return &qp->orq[qp->orq_get % qp->attrs.orq_size]; } -static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp) -{ - return &qp->orq[qp->orq_put % qp->attrs.orq_size]; -} - static inline struct siw_sqe *orq_get_free(struct siw_qp *qp) { - struct siw_sqe *orq_e = orq_get_tail(qp); + struct siw_sqe *orq_e = &qp->orq[qp->orq_put % qp->attrs.orq_size]; if (READ_ONCE(orq_e->flags) == 0) return orq_e; diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 60116f20653c..875ea6f1b04a 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1153,11 +1153,12 @@ static int siw_check_tx_fence(struct siw_qp *qp) spin_lock_irqsave(&qp->orq_lock, flags); - rreq = orq_get_current(qp); - /* free current orq entry */ + rreq = orq_get_current(qp); WRITE_ONCE(rreq->flags, 0); + qp->orq_get++; + if (qp->tx_ctx.orq_fence) { if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { pr_warn("siw: [QP %u]: fence resume: bad status %d\n", @@ -1165,10 +1166,12 @@ static int siw_check_tx_fence(struct siw_qp *qp) rv = -EPROTO; goto out; } - /* resume SQ processing */ + /* resume SQ processing, if possible */ if (tx_waiting->sqe.opcode == SIW_OP_READ || tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { - rreq = orq_get_tail(qp); + + /* SQ processing was stopped because of a full ORQ */ + rreq = orq_get_free(qp); if (unlikely(!rreq)) { pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); rv = -EPROTO; @@ -1181,15 +1184,14 @@ static int siw_check_tx_fence(struct siw_qp *qp) resume_tx = 1; } else if (siw_orq_empty(qp)) { + /* + * SQ processing was stopped by fenced work request. + * Resume since all previous Read's are now completed. + */ qp->tx_ctx.orq_fence = 0; resume_tx = 1; - } else { - pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", - qp_id(qp), qp->orq_get, qp->orq_put); - rv = -EPROTO; } } - qp->orq_get++; out: spin_unlock_irqrestore(&qp->orq_lock, flags); From f3136c4ce7acf64bee43135971ca52a880572e32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Jan 2022 11:45:26 +0200 Subject: [PATCH 0903/1295] RDMA/mlx4: Don't continue event handler after memory allocation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The failure to allocate memory during MLX4_DEV_EVENT_PORT_MGMT_CHANGE event handler will cause skip the assignment logic, but ib_dispatch_event() will be called anyway. Fix it by calling to return instead of break after memory allocation failure. Fixes: 00f5ce99dc6e ("mlx4: Use port management change event instead of smp_snoop") Link: https://lore.kernel.org/r/12a0e83f18cfad4b5f62654f141e240d04915e10.1643622264.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Reviewed-by: Håkon Bugge Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c3d97229988..93b1650eacfa 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3237,7 +3237,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) - break; + return; INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); From 1c7f0e349aa5f8f80b1cac3d4917405332e14cdf Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 1 Feb 2022 13:21:44 +0200 Subject: [PATCH 0904/1295] ALSA: hda: Skip codec shutdown in case the codec is not registered If the codec->registered is not set then it means that pm_runtime is not yet enabled and the codec->pcm_list_head has not been initialized. The access to the not initialized pcm_list_head will lead a kernel crash during shutdown. Reported-by: Guennadi Liakhovetski Signed-off-by: Peter Ujfalusi Tested-by: Guennadi Liakhovetski Fixes: b98444ed597d ("ALSA: hda: Suspend codec at shutdown") Link: https://lore.kernel.org/r/20220201112144.29411-1-peter.ujfalusi@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 7016b48227bf..f552785d301e 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -3000,6 +3000,10 @@ void snd_hda_codec_shutdown(struct hda_codec *codec) { struct hda_pcm *cpcm; + /* Skip the shutdown if codec is not registered */ + if (!codec->registered) + return; + list_for_each_entry(cpcm, &codec->pcm_list_head, list) snd_pcm_suspend_all(cpcm->pcm); From 836f35f79153ce09d813c83f341dba4481996966 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Thu, 27 Jan 2022 14:03:58 -0500 Subject: [PATCH 0905/1295] platform/x86: thinkpad_acpi: Fix incorrect use of platform profile on AMD platforms Lenovo AMD based platforms have been offering platform_profiles but they are not working correctly. This is because the mode we are using on the Intel platforms (MMC) is not available on the AMD platforms. This commit adds checking of the functional capabilities returned by the BIOS to confirm if MMC is supported or not. Profiles will not be available if the platform is not MMC capable. I'm investigating and working on an alternative for AMD platforms but that is still work-in-progress. Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20220127190358.4078-1-markpearson@lenovo.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 33f611af6e51..bd045486b933 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -10119,6 +10119,9 @@ static struct ibm_struct proxsensor_driver_data = { #define DYTC_CMD_MMC_GET 8 /* To get current MMC function and mode */ #define DYTC_CMD_RESET 0x1ff /* To reset back to default */ +#define DYTC_CMD_FUNC_CAP 3 /* To get DYTC capabilities */ +#define DYTC_FC_MMC 27 /* MMC Mode supported */ + #define DYTC_GET_FUNCTION_BIT 8 /* Bits 8-11 - function setting */ #define DYTC_GET_MODE_BIT 12 /* Bits 12-15 - mode setting */ @@ -10331,6 +10334,15 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) if (dytc_version < 5) return -ENODEV; + /* Check what capabilities are supported. Currently MMC is needed */ + err = dytc_command(DYTC_CMD_FUNC_CAP, &output); + if (err) + return err; + if (!(output & BIT(DYTC_FC_MMC))) { + dbg_printk(TPACPI_DBG_INIT, " DYTC MMC mode not supported\n"); + return -ENODEV; + } + dbg_printk(TPACPI_DBG_INIT, "DYTC version %d: thermal mode available\n", dytc_version); /* From e9cc5d48d4f463fbea19dd93253e98af3b612b7c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 11:04:23 -0300 Subject: [PATCH 0906/1295] tools include UAPI: Sync sound/asound.h copy with the kernel sources Picking the changes from: 55b71f6c29f2a78a ("ALSA: uapi: use C90 comment style instead of C99 style") fb6723daf89083a0 ("ALSA: pcm: comment about relation between msbits hw parameter and [S|U]32 formats") b456abe63f60ad93 ("ALSA: pcm: introduce INFO_NO_REWINDS flag") 5aec579e08e4f2be ("ALSA: uapi: Fix a C++ style comment in asound.h") Which entails no changes in the tooling side as it doesn't introduce new SNDRV_PCM_IOCTL_ ioctls. To silence this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Mark Brown Cc: Pierre-Louis Bossart Cc: Takashi Iwai Cc: Takashi Sakamoto Link: https://lore.kernel.org/all/YflN0j09T+6ODHIh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index 5fbb79e30819..ef0cafe295b2 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -202,6 +202,11 @@ typedef int __bitwise snd_pcm_format_t; #define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ #define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ #define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ +/* + * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the + * available bit count in most significant bit. It's for the case of so-called 'left-justified' or + * `right-padding` sample which has less width than 32 bit. + */ #define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) #define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) #define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) @@ -300,7 +305,7 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ #define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ #define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ - +#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ #define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ #define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ From 88443d3f79b8d2b5679f2a1df1482fa024be2353 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 May 2021 16:00:31 -0300 Subject: [PATCH 0907/1295] tools headers UAPI: Sync linux/perf_event.h with the kernel sources To pick the trivial change in: cb1c4aba055f928f ("perf: Add new macros for mem_hops field") Just comment source code alignment. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Kajol Jain Cc: Michael Ellerman Link: https://lore.kernel.org/lkml/YflPKLhu2AtHmPov@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4cd39aaccbe7..1b65042ab1db 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1332,9 +1332,9 @@ union perf_mem_data_src { /* hop level */ #define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 From d5381cc9f123d64bee1d1a124cd98faa5fa36ca6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 0908/1295] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: 690a757d610e50c2 ("kvm: x86: Add CPUID support for Intel AMX") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Jing Liu Cc: Paolo Bonzini Link: https://lore.kernel.org/lkml/YflQCEO9FRLeTmlB@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 18de5f76f198..6db4e2932b3d 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -299,7 +299,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ #define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ From 100198322b2eb7fb38750cb0fcae5cd533907410 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Feb 2022 12:53:16 -0300 Subject: [PATCH 0909/1295] perf beauty: Make the prctl arg regexp more strict to cope with PR_SET_VMA This new PR_SET_VMA value isn't in sequence with all the other prctl arguments and instead uses a big, 0x prefixed hex number: 0x53564d41 (S V M A). This makes it harder to generate a string table as it would be rather sparse, so make the regexp more stricter to avoid catching those. A followup patch for 'perf trace' to cope with such oddities will be needed, but then its a matter for the next merge window. The next patch will update the prctl.h file to cope with this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h' diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Here is the output of this script: $ tools/perf/trace/beauty/prctl_option.sh static const char *prctl_options[] = { [1] = "SET_PDEATHSIG", [2] = "GET_PDEATHSIG", [3] = "GET_DUMPABLE", [4] = "SET_DUMPABLE", [5] = "GET_UNALIGN", [6] = "SET_UNALIGN", [7] = "GET_KEEPCAPS", [8] = "SET_KEEPCAPS", [9] = "GET_FPEMU", [10] = "SET_FPEMU", [11] = "GET_FPEXC", [12] = "SET_FPEXC", [13] = "GET_TIMING", [14] = "SET_TIMING", [15] = "SET_NAME", [16] = "GET_NAME", [19] = "GET_ENDIAN", [20] = "SET_ENDIAN", [21] = "GET_SECCOMP", [22] = "SET_SECCOMP", [23] = "CAPBSET_READ", [24] = "CAPBSET_DROP", [25] = "GET_TSC", [26] = "SET_TSC", [27] = "GET_SECUREBITS", [28] = "SET_SECUREBITS", [29] = "SET_TIMERSLACK", [30] = "GET_TIMERSLACK", [31] = "TASK_PERF_EVENTS_DISABLE", [32] = "TASK_PERF_EVENTS_ENABLE", [33] = "MCE_KILL", [34] = "MCE_KILL_GET", [35] = "SET_MM", [36] = "SET_CHILD_SUBREAPER", [37] = "GET_CHILD_SUBREAPER", [38] = "SET_NO_NEW_PRIVS", [39] = "GET_NO_NEW_PRIVS", [40] = "GET_TID_ADDRESS", [41] = "SET_THP_DISABLE", [42] = "GET_THP_DISABLE", [43] = "MPX_ENABLE_MANAGEMENT", [44] = "MPX_DISABLE_MANAGEMENT", [45] = "SET_FP_MODE", [46] = "GET_FP_MODE", [47] = "CAP_AMBIENT", [50] = "SVE_SET_VL", [51] = "SVE_GET_VL", [52] = "GET_SPECULATION_CTRL", [53] = "SET_SPECULATION_CTRL", [54] = "PAC_RESET_KEYS", [55] = "SET_TAGGED_ADDR_CTRL", [56] = "GET_TAGGED_ADDR_CTRL", [57] = "SET_IO_FLUSHER", [58] = "GET_IO_FLUSHER", [59] = "SET_SYSCALL_USER_DISPATCH", [60] = "PAC_SET_ENABLED_KEYS", [61] = "PAC_GET_ENABLED_KEYS", [62] = "SCHED_CORE", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", [2] = "END_CODE", [3] = "START_DATA", [4] = "END_DATA", [5] = "START_STACK", [6] = "START_BRK", [7] = "BRK", [8] = "ARG_START", [9] = "ARG_END", [10] = "ENV_START", [11] = "ENV_END", [12] = "AUXV", [13] = "EXE_FILE", [14] = "MAP", [15] = "MAP_SIZE", }; $ Cc: Adrian Hunter Cc: Colin Cross Cc: Ian Rogers Cc: Jiri Olsa Cc: Kees Cook Cc: Namhyung Kim Cc: Suren Baghdasaryan Link: https://lore.kernel.org/lkml/YflZqY0rYQ3d1bKt@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/prctl_option.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index 3109d7b05e11..3d278785fe57 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -4,7 +4,7 @@ [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ printf "static const char *prctl_options[] = {\n" -regex='^#define[[:space:]]+PR_(\w+)[[:space:]]*([[:xdigit:]]+).*' +regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*\/.*)?$' egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ sed -r "s/$regex/\2 \1/g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" From fc45e6588d57b65378612fce07089276141509dc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Feb 2021 12:50:52 -0300 Subject: [PATCH 0910/1295] tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick the changes in: 9a10064f5625d557 ("mm: add a field to store names for private anonymous memory") That don't result in any changes in tooling: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after $ This actually adds a new prctl arg, but it has to be dealt with differently, as it is not in sequence with the other arguments. Just silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h' diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Cc: Adrian Hunter Cc: Colin Cross Cc: Ian Rogers Cc: Jiri Olsa Cc: Kees Cook Cc: Namhyung Kim Cc: Suren Baghdasaryan Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ From 052e04a52dcd3359ba1df25a508a3a93707a3f6e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:42 +0000 Subject: [PATCH 0911/1295] cifs: Transition from ->readpages() to ->readahead() Transition the cifs filesystem from using the old ->readpages() method to using the new ->readahead() method. For the moment, this removes any invocation of fscache to read data from the local cache, leaving that to another patch. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Matthew Wilcox cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Reviewed-by: Rohith Surabattula Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 174 ++++++++++--------------------------------------- 1 file changed, 36 insertions(+), 138 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 59334be9ed3b..be62dc29dc54 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4269,8 +4269,6 @@ cifs_readv_complete(struct work_struct *work) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - lru_cache_add(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); @@ -4340,7 +4338,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, * fill them until the writes are flushed. */ zero_user(page, 0, PAGE_SIZE); - lru_cache_add(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); @@ -4350,7 +4347,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, continue; } else { /* no need to hold page hostage */ - lru_cache_add(page); unlock_page(page); put_page(page); rdata->pages[i] = NULL; @@ -4393,92 +4389,16 @@ cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, return readpages_fill_pages(server, rdata, iter, iter->count); } -static int -readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - unsigned int rsize, struct list_head *tmplist, - unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) -{ - struct page *page, *tpage; - unsigned int expected_index; - int rc; - gfp_t gfp = readahead_gfp_mask(mapping); - - INIT_LIST_HEAD(tmplist); - - page = lru_to_page(page_list); - - /* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. - */ - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, gfp); - - /* give up if we can't stick it in the cache */ - if (rc) { - __ClearPageLocked(page); - return rc; - } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; - *bytes = PAGE_SIZE; - *nr_pages = 1; - list_move_tail(&page->lru, tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (*bytes + PAGE_SIZE > rsize) - break; - - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, page->index, gfp); - if (rc) { - __ClearPageLocked(page); - break; - } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; - (*nr_pages)++; - } - return rc; -} - -static int cifs_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned num_pages) +static void cifs_readahead(struct readahead_control *ractl) { int rc; - int err = 0; - struct list_head tmplist; - struct cifsFileInfo *open_file = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifsFileInfo *open_file = ractl->file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; pid_t pid; - unsigned int xid; + unsigned int xid, last_batch_size = 0; xid = get_xid(); - /* - * Reads as many pages as possible from fscache. Returns -ENOBUFS - * immediately if the cookie is negative - * - * After this point, every page in the list might have PG_fscache set, - * so we will need to clean that up off of every page we don't use. - */ - rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, - &num_pages); - if (rc == 0) { - free_xid(xid); - return rc; - } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -4489,39 +4409,32 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, file, mapping, num_pages); + __func__, ractl->file, ractl->mapping, readahead_count(ractl)); /* - * Start with the page at end of list and move it to private - * list. Do the same with any following pages until we hit - * the rsize limit, hit an index discontinuity, or run out of - * pages. Issue the async read and then start the loop again - * until the list is empty. - * - * Note that list order is important. The page_list is in - * the order of declining indexes. When we put the pages in - * the rdata->pages, then we want them in increasing order. + * Chop the readahead request up into rsize-sized read requests. */ - while (!list_empty(page_list) && !err) { - unsigned int i, nr_pages, bytes, rsize; - loff_t offset; - struct page *page, *tpage; + while (readahead_count(ractl) - last_batch_size) { + unsigned int i, nr_pages, got, rsize; + struct page *page; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); - if (rc == -EAGAIN) - continue; - else if (rc) + if (rc) { + if (rc == -EAGAIN) + continue; break; + } } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) break; + nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); /* * Give up immediately if rsize is too small to read an entire @@ -4529,16 +4442,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * reach this point however since we set ra_pages to 0 when the * rsize is smaller than a cache page. */ - if (unlikely(rsize < PAGE_SIZE)) { - add_credits_and_wake_if(server, credits, 0); - free_xid(xid); - return 0; - } - - nr_pages = 0; - err = readpages_get_pages(mapping, page_list, rsize, &tmplist, - &nr_pages, &offset, &bytes); - if (!nr_pages) { + if (unlikely(!nr_pages)) { add_credits_and_wake_if(server, credits, 0); break; } @@ -4546,36 +4450,31 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - lru_cache_add(page); - unlock_page(page); - put_page(page); - } - rc = -ENOMEM; add_credits_and_wake_if(server, credits, 0); break; } - rdata->cfile = cifsFileInfo_get(open_file); - rdata->server = server; - rdata->mapping = mapping; - rdata->offset = offset; - rdata->bytes = bytes; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; - rdata->read_into_pages = cifs_readpages_read_into_pages; - rdata->copy_into_pages = cifs_readpages_copy_into_pages; - rdata->credits = credits_on_stack; - - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - rdata->pages[rdata->nr_pages++] = page; + got = __readahead_batch(ractl, rdata->pages, nr_pages); + if (got != nr_pages) { + pr_warn("__readahead_batch() returned %u/%u\n", + got, nr_pages); + nr_pages = got; } - rc = adjust_credits(server, &rdata->credits, rdata->bytes); + rdata->nr_pages = nr_pages; + rdata->bytes = readahead_batch_length(ractl); + rdata->cfile = cifsFileInfo_get(open_file); + rdata->server = server; + rdata->mapping = ractl->mapping; + rdata->offset = readahead_pos(ractl); + rdata->pid = pid; + rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; + rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->copy_into_pages = cifs_readpages_copy_into_pages; + rdata->credits = credits_on_stack; + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { if (rdata->cfile->invalidHandle) rc = -EAGAIN; @@ -4587,7 +4486,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; - lru_cache_add(page); unlock_page(page); put_page(page); } @@ -4597,10 +4495,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } kref_put(&rdata->refcount, cifs_readdata_release); + last_batch_size = nr_pages; } free_xid(xid); - return rc; } /* @@ -4924,7 +4822,7 @@ oplock_break_done: * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests * so this method should never be called. * - * Direct IO is not yet supported in the cached mode. + * Direct IO is not yet supported in the cached mode. */ static ssize_t cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter) @@ -5006,7 +4904,7 @@ static int cifs_set_page_dirty(struct page *page) const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, - .readpages = cifs_readpages, + .readahead = cifs_readahead, .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:50 +0000 Subject: [PATCH 0912/1295] netfs, cachefiles: Add a method to query presence of data in the cache Add a netfs_cache_ops method by which a network filesystem can ask the cache about what data it has available and where so that it can make a multipage read more efficient. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- Documentation/filesystems/netfs_library.rst | 16 ++++++ fs/cachefiles/io.c | 59 +++++++++++++++++++++ include/linux/netfs.h | 7 +++ 3 files changed, 82 insertions(+) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 136f8da3d0e2..4f373a8ec47b 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -462,6 +462,10 @@ operation table looks like the following:: struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); + + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; With a termination handler function pointer:: @@ -536,6 +540,18 @@ The methods defined in the table are: indicating whether the termination is definitely happening in the caller's context. + * ``query_occupancy()`` + + [Required] Called to find out where the next piece of data is within a + particular region of the cache. The start and length of the region to be + queried are passed in, along with the granularity to which the answer needs + to be aligned. The function passes back the start and length of the data, + if any, available within that region. Note that there may be a hole at the + front. + + It returns 0 if some data was found, -ENODATA if there was no usable data + within the region or -ENOBUFS if there is no caching on this file. + Note that these methods are passed a pointer to the cache resource structure, not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 04eb52736990..753986ea1583 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -191,6 +191,64 @@ presubmission_error: return ret; } +/* + * Query the occupancy of the cache in a region, returning where the next chunk + * of data starts and how long it is. + */ +static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len) +{ + struct cachefiles_object *object; + struct file *file; + loff_t off, off2; + + *_data_start = -1; + *_data_len = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + return -ENOBUFS; + + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + granularity = max_t(size_t, object->volume->cache->bsize, granularity); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start, len, + i_size_read(file_inode(file))); + + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off < 0 && off >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + if (round_up(off, granularity) >= start + len) + return -ENODATA; /* No data in range */ + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_HOLE); + if (off2 == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + + /* Round away partial blocks */ + off = round_up(off, granularity); + off2 = round_down(off2, granularity); + if (off2 <= off) + return -ENODATA; + + *_data_start = off; + if (off2 > start + len) + *_data_len = len; + else + *_data_len = off2 - off; + return 0; +} + /* * Handle completion of a write to the cache. */ @@ -545,6 +603,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .query_occupancy = cachefiles_query_occupancy, }; /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b46c39d98bbd..614f22213e21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -244,6 +244,13 @@ struct netfs_cache_ops { int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + + /* Query the occupancy of the cache in a region, returning where the + * next chunk of data starts and how long it is. + */ + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; struct readahead_control; From 0174ee9947bd0f24fee2794b35258960d108b7aa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:58 +0000 Subject: [PATCH 0913/1295] cifs: Implement cache I/O by accessing the cache directly Move cifs to using fscache DIO API instead of the old upstream I/O API as that has been removed. This is a stopgap solution as the intention is that at sometime in the future, the cache will move to using larger blocks and won't be able to store individual pages in order to deal with the potential for data corruption due to the backing filesystem being able insert/remove bridging blocks of zeros into its extent list[1]. cifs then reads and writes cache pages synchronously and one page at a time. The preferred change would be to use the netfs lib, but the new I/O API can be used directly. It's just that as the cache now needs to track data for itself, caching blocks may exceed page size... This code is somewhat borrowed from my "fallback I/O" patchset[2]. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/YO17ZNOcq+9PajfQ@mit.edu [1] Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2] Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 55 +++++++++++++++++-- fs/cifs/fscache.c | 132 ++++++++++++++++++++++++++++++++++++++-------- fs/cifs/fscache.h | 81 +++++++++++++++++----------- 3 files changed, 211 insertions(+), 57 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index be62dc29dc54..a50912674915 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4276,12 +4276,12 @@ cifs_readv_complete(struct work_struct *work) } else SetPageError(page); - unlock_page(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); + unlock_page(page); + got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); put_page(page); @@ -4396,7 +4396,11 @@ static void cifs_readahead(struct readahead_control *ractl) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; pid_t pid; - unsigned int xid, last_batch_size = 0; + unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; + pgoff_t next_cached = ULONG_MAX; + bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && + cifs_inode_cookie(ractl->mapping->host)->cache_priv; + bool check_cache = caching; xid = get_xid(); @@ -4414,12 +4418,52 @@ static void cifs_readahead(struct readahead_control *ractl) /* * Chop the readahead request up into rsize-sized read requests. */ - while (readahead_count(ractl) - last_batch_size) { - unsigned int i, nr_pages, got, rsize; + while ((nr_pages = readahead_count(ractl) - last_batch_size)) { + unsigned int i, got, rsize; struct page *page; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + pgoff_t index = readahead_index(ractl) + last_batch_size; + + /* + * Find out if we have anything cached in the range of + * interest, and if so, where the next chunk of cached data is. + */ + if (caching) { + if (check_cache) { + rc = cifs_fscache_query_occupancy( + ractl->mapping->host, index, nr_pages, + &next_cached, &cache_nr_pages); + if (rc < 0) + caching = false; + check_cache = false; + } + + if (index == next_cached) { + /* + * TODO: Send a whole batch of pages to be read + * by the cache. + */ + page = readahead_page(ractl); + + if (cifs_readpage_from_fscache(ractl->mapping->host, + page) < 0) { + /* + * TODO: Deal with cache read failure + * here, but for the moment, delegate + * that to readpage. + */ + caching = false; + } + unlock_page(page); + next_cached++; + cache_nr_pages--; + if (cache_nr_pages == 0) + check_cache = true; + continue; + } + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); @@ -4435,6 +4479,7 @@ static void cifs_readahead(struct readahead_control *ractl) if (rc) break; nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); + nr_pages = min_t(size_t, nr_pages, next_cached - index); /* * Give up immediately if rsize is too small to read an entire diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index efaac4d5ff55..33af72e0ac0c 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -134,37 +134,127 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) } } +static inline void fscache_end_operation(struct netfs_cache_resources *cres) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); +} + +/* + * Fallback page reading interface. + */ +static int fscache_fallback_read_page(struct inode *inode, struct page *page) +{ + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, + NULL, NULL); + fscache_end_operation(&cres); + return ret; +} + +/* + * Fallback page writing interface. + */ +static int fscache_fallback_write_page(struct inode *inode, struct page *page, + bool no_space_allocated_yet) +{ + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + loff_t start = page_offset(page); + size_t len = PAGE_SIZE; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_write_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + no_space_allocated_yet); + if (ret == 0) + ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + fscache_end_operation(&cres); + return ret; +} + /* * Retrieve a page from FS-Cache */ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", - __func__, CIFS_I(inode)->fscache, page, inode); - return -ENOBUFS; // Needs conversion to using netfslib -} + int ret; -/* - * Retrieve a set of pages from FS-Cache - */ -int __cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", - __func__, CIFS_I(inode)->fscache, *nr_pages, inode); - return -ENOBUFS; // Needs conversion to using netfslib + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, cifs_inode_cookie(inode), page, inode); + + ret = fscache_fallback_read_page(inode, page); + if (ret < 0) + return ret; + + /* Read completed synchronously */ + SetPageUptodate(page); + return 0; } void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - WARN_ON(!cifsi->fscache); - cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifsi->fscache, page, inode); + __func__, cifs_inode_cookie(inode), page, inode); - // Needs conversion to using netfslib + fscache_fallback_write_page(inode, page, true); +} + +/* + * Query the cache occupancy. + */ +int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + loff_t start, data_start; + size_t len, data_len; + int ret; + + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; + + start = first * PAGE_SIZE; + len = nr_pages * PAGE_SIZE; + ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE, + &data_start, &data_len); + if (ret == 0) { + *_data_first = data_start / PAGE_SIZE; + *_data_nr_pages = len / PAGE_SIZE; + } + + fscache_end_operation(&cres); + return ret; } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index c6ca49ac33d4..55129908e2c1 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -9,6 +9,7 @@ #ifndef _CIFS_FSCACHE_H #define _CIFS_FSCACHE_H +#include #include #include "cifsglob.h" @@ -58,14 +59,6 @@ void cifs_fscache_fill_coherency(struct inode *inode, } -extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); -extern int __cifs_readpage_from_fscache(struct inode *, struct page *); -extern int __cifs_readpages_from_fscache(struct inode *, - struct address_space *, - struct list_head *, - unsigned *); -extern void __cifs_readpage_to_fscache(struct inode *, struct page *); - static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return CIFS_I(inode)->fscache; @@ -80,33 +73,52 @@ static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags i_size_read(inode), flags); } +extern int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages); + +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + if (!cifs_inode_cookie(inode)) + return -ENOBUFS; + return __cifs_fscache_query_occupancy(inode, first, nr_pages, + _data_first, _data_nr_pages); +} + +extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); +extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); + + static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - if (CIFS_I(inode)->fscache) + if (cifs_inode_cookie(inode)) return __cifs_readpage_from_fscache(inode, page); - - return -ENOBUFS; -} - -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - if (CIFS_I(inode)->fscache) - return __cifs_readpages_from_fscache(inode, mapping, pages, - nr_pages); return -ENOBUFS; } static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) { - if (PageFsCache(page)) + if (cifs_inode_cookie(inode)) __cifs_readpage_to_fscache(inode, page); } +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + } + return true; +} + #else /* CONFIG_CIFS_FSCACHE */ static inline void cifs_fscache_fill_coherency(struct inode *inode, @@ -123,22 +135,29 @@ static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool upd static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + *_data_first = ULONG_MAX; + *_data_nr_pages = 0; + return -ENOBUFS; +} + static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} +static inline +void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) {} +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return true; /* May release page */ +} #endif /* CONFIG_CIFS_FSCACHE */ From 46f5cbdef7d4fbb0f857a3caddec6799d0b5bb2f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 31 Jan 2022 17:54:43 +0000 Subject: [PATCH 0914/1295] cifs: Fix the readahead conversion to manage the batch when reading from cache Fix the readahead conversion to correctly manage the last batch skipping when reading from cache. This involves a readahead batch of one page or one folio, so set the batch size according to the number of constituent pages (should be 1 for a filesystem that doesn't do multipage folios yet). Signed-off-by: David Howells cc: Steve French Reviewed-by: Rohith Surabattula Reviewed-by: Shyam Prasad N cc: Jeff Layton cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a50912674915..e7af802dcfa6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4446,7 +4446,7 @@ static void cifs_readahead(struct readahead_control *ractl) * by the cache. */ page = readahead_page(ractl); - + last_batch_size = 1 << thp_order(page); if (cifs_readpage_from_fscache(ractl->mapping->host, page) < 0) { /* From 68defd528f94ed1cf11f49a75cc1875dccd781fa Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 7 Dec 2021 13:23:06 +0200 Subject: [PATCH 0915/1295] e1000e: Separate ADP board type from TGP We have the same LAN controller on different PCH's. Separate ADP board type from a TGP which will allow for specific fixes to be applied for ADP platforms. Suggested-by: Kai-Heng Feng Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Nechama Kraus Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/e1000.h | 4 ++- drivers/net/ethernet/intel/e1000e/ich8lan.c | 20 +++++++++++++ drivers/net/ethernet/intel/e1000e/netdev.c | 33 +++++++++++---------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index c3def0ee7788..8d06c9d8ff8b 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -115,7 +115,8 @@ enum e1000_boards { board_pch_lpt, board_pch_spt, board_pch_cnp, - board_pch_tgp + board_pch_tgp, + board_pch_adp }; struct e1000_ps_page { @@ -502,6 +503,7 @@ extern const struct e1000_info e1000_pch_lpt_info; extern const struct e1000_info e1000_pch_spt_info; extern const struct e1000_info e1000_pch_cnp_info; extern const struct e1000_info e1000_pch_tgp_info; +extern const struct e1000_info e1000_pch_adp_info; extern const struct e1000_info e1000_es2_info; void e1000e_ptp_init(struct e1000_adapter *adapter); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 5e4fc9b4e2ad..c908c84b86d2 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -6021,3 +6021,23 @@ const struct e1000_info e1000_pch_tgp_info = { .phy_ops = &ich8_phy_ops, .nvm_ops = &spt_nvm_ops, }; + +const struct e1000_info e1000_pch_adp_info = { + .mac = e1000_pch_adp, + .flags = FLAG_IS_ICH + | FLAG_HAS_WOL + | FLAG_HAS_HW_TIMESTAMP + | FLAG_HAS_CTRLEXT_ON_LOAD + | FLAG_HAS_AMT + | FLAG_HAS_FLASH + | FLAG_HAS_JUMBO_FRAMES + | FLAG_APME_IN_WUC, + .flags2 = FLAG2_HAS_PHY_STATS + | FLAG2_HAS_EEE, + .pba = 26, + .max_hw_frame_size = 9022, + .get_variants = e1000_get_variants_ich8lan, + .mac_ops = &ich8_mac_ops, + .phy_ops = &ich8_phy_ops, + .nvm_ops = &spt_nvm_ops, +}; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 635a95927e93..d2de8bc4c3b7 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -52,6 +52,7 @@ static const struct e1000_info *e1000_info_tbl[] = { [board_pch_spt] = &e1000_pch_spt_info, [board_pch_cnp] = &e1000_pch_cnp_info, [board_pch_tgp] = &e1000_pch_tgp_info, + [board_pch_adp] = &e1000_pch_adp_info, }; struct e1000_reg_info { @@ -7898,22 +7899,22 @@ static const struct pci_device_id e1000_pci_tbl[] = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V14), board_pch_tgp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM15), board_pch_tgp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V15), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_tgp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_adp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_adp }, { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ }; From cad014b7b5a6897d8c4fad13e2888978bfb7a53f Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 7 Dec 2021 13:23:42 +0200 Subject: [PATCH 0916/1295] e1000e: Handshake with CSME starts from ADL platforms Handshake with CSME/AMT on none provisioned platforms during S0ix flow is not supported on TGL platform and can cause to HW unit hang. Update the handshake with CSME flow to start from the ADL platform. Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Signed-off-by: Sasha Neftin Tested-by: Nechama Kraus Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index d2de8bc4c3b7..a42aeb555f34 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6342,7 +6342,8 @@ static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter) u32 mac_data; u16 phy_data; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { /* Request ME configure the device for S0ix */ mac_data = er32(H2ME); mac_data |= E1000_H2ME_START_DPG; @@ -6491,7 +6492,8 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) u16 phy_data; u32 i = 0; - if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { /* Request ME unconfigure the device from S0ix */ mac_data = er32(H2ME); mac_data &= ~E1000_H2ME_START_DPG; From 053ca37c87af65f41f5842070c68aa53c3d035f5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 27 Jan 2022 15:49:49 -0600 Subject: [PATCH 0917/1295] PCI: j721e: Initialize pcie->cdns_pcie before using it Christian reported a NULL pointer dereference in j721e_pcie_probe() caused by 19e863828acf ("PCI: j721e: Drop redundant struct device *"), which removed struct j721e_pcie.dev since there's another copy in struct cdns_pcie.dev reachable via j721e_pcie->cdns_pcie->dev. The problem is that j721e_pcie->cdns_pcie was dereferenced before being initialized: j721e_pcie_probe pcie = devm_kzalloc() # struct j721e_pcie j721e_pcie_ctrl_init(pcie) dev = pcie->cdns_pcie->dev <-- dereference cdns_pcie switch (mode) { case PCI_MODE_RC: cdns_pcie = ... # alloc as part of pci_host_bridge pcie->cdns_pcie = cdns_pcie <-- initialize pcie->cdns_pcie Move the cdns_pcie initialization earlier so it is done before it is used. This also simplifies the error exits. Fixes: 19e863828acf ("PCI: j721e: Drop redundant struct device *") Link: https://lore.kernel.org/r/20220127222951.GA144828@bhelgaas Link: https://lore.kernel.org/r/20220124122132.435743-1-christian.gmeiner@gmail.com Reported-by: Christian Gmeiner Tested-by: Christian Gmeiner Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/cadence/pci-j721e.c | 85 +++++++++++----------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index 489586a4cdc7..768d33f9ebc8 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -356,8 +356,8 @@ static int j721e_pcie_probe(struct platform_device *pdev) const struct j721e_pcie_data *data; struct cdns_pcie *cdns_pcie; struct j721e_pcie *pcie; - struct cdns_pcie_rc *rc; - struct cdns_pcie_ep *ep; + struct cdns_pcie_rc *rc = NULL; + struct cdns_pcie_ep *ep = NULL; struct gpio_desc *gpiod; void __iomem *base; struct clk *clk; @@ -376,6 +376,46 @@ static int j721e_pcie_probe(struct platform_device *pdev) if (!pcie) return -ENOMEM; + switch (mode) { + case PCI_MODE_RC: + if (!IS_ENABLED(CONFIG_PCIE_CADENCE_HOST)) + return -ENODEV; + + bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc)); + if (!bridge) + return -ENOMEM; + + if (!data->byte_access_allowed) + bridge->ops = &cdns_ti_pcie_host_ops; + rc = pci_host_bridge_priv(bridge); + rc->quirk_retrain_flag = data->quirk_retrain_flag; + rc->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; + + cdns_pcie = &rc->pcie; + cdns_pcie->dev = dev; + cdns_pcie->ops = &j721e_pcie_ops; + pcie->cdns_pcie = cdns_pcie; + break; + case PCI_MODE_EP: + if (!IS_ENABLED(CONFIG_PCIE_CADENCE_EP)) + return -ENODEV; + + ep = devm_kzalloc(dev, sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + ep->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; + + cdns_pcie = &ep->pcie; + cdns_pcie->dev = dev; + cdns_pcie->ops = &j721e_pcie_ops; + pcie->cdns_pcie = cdns_pcie; + break; + default: + dev_err(dev, "INVALID device type %d\n", mode); + return 0; + } + pcie->mode = mode; pcie->linkdown_irq_regfield = data->linkdown_irq_regfield; @@ -426,28 +466,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) switch (mode) { case PCI_MODE_RC: - if (!IS_ENABLED(CONFIG_PCIE_CADENCE_HOST)) { - ret = -ENODEV; - goto err_get_sync; - } - - bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc)); - if (!bridge) { - ret = -ENOMEM; - goto err_get_sync; - } - - if (!data->byte_access_allowed) - bridge->ops = &cdns_ti_pcie_host_ops; - rc = pci_host_bridge_priv(bridge); - rc->quirk_retrain_flag = data->quirk_retrain_flag; - rc->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; - - cdns_pcie = &rc->pcie; - cdns_pcie->dev = dev; - cdns_pcie->ops = &j721e_pcie_ops; - pcie->cdns_pcie = cdns_pcie; - gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); if (IS_ERR(gpiod)) { ret = PTR_ERR(gpiod); @@ -497,23 +515,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) break; case PCI_MODE_EP: - if (!IS_ENABLED(CONFIG_PCIE_CADENCE_EP)) { - ret = -ENODEV; - goto err_get_sync; - } - - ep = devm_kzalloc(dev, sizeof(*ep), GFP_KERNEL); - if (!ep) { - ret = -ENOMEM; - goto err_get_sync; - } - ep->quirk_detect_quiet_flag = data->quirk_detect_quiet_flag; - - cdns_pcie = &ep->pcie; - cdns_pcie->dev = dev; - cdns_pcie->ops = &j721e_pcie_ops; - pcie->cdns_pcie = cdns_pcie; - ret = cdns_pcie_init_phy(dev, cdns_pcie); if (ret) { dev_err(dev, "Failed to init phy\n"); @@ -525,8 +526,6 @@ static int j721e_pcie_probe(struct platform_device *pdev) goto err_pcie_setup; break; - default: - dev_err(dev, "INVALID device type %d\n", mode); } return 0; From 24f6008564183aa120d07c03d9289519c2fe02af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Jan 2022 11:04:01 -0600 Subject: [PATCH 0918/1295] cgroup-v1: Require capabilities to set release_agent The cgroup release_agent is called with call_usermodehelper. The function call_usermodehelper starts the release_agent with a full set fo capabilities. Therefore require capabilities when setting the release_agaent. Reported-by: Tabitha Sable Tested-by: Tabitha Sable Fixes: 81a6a5cdd2c5 ("Task Control Groups: automatic userspace notification of idle cgroups") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 41e0837a5a0b..0e877dbcfeea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -954,6 +962,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; From 8cfe148a7136bc60452a5c6b7ac2d9d15c36909b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:23 +0000 Subject: [PATCH 0919/1295] kvm/arm64: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we enter an RCU extended quiescent state (EQS) by calling guest_enter_irqoff(), and unmasked IRQs prior to exiting the EQS by calling guest_exit(). As the IRQ entry code will not wake RCU in this case, we may run the core IRQ code and IRQ handler without RCU watching, leading to various potential problems. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_arm_enter_exit_vcpu() helper which is marked noinstr. Fixes: 1b3d546daf85ed2b ("arm/arm64: KVM: Properly account for guest CPU time") Reported-by: Nicolas Saenz Julienne Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Nicolas Saenz Julienne Cc: Alexandru Elisei Cc: Catalin Marinas Cc: Frederic Weisbecker Cc: James Morse Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Suzuki K Poulose Cc: Will Deacon Message-Id: <20220201132926.3301912-3-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 51 ++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 868109cf96b4..a069d5925f77 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -790,6 +790,24 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) xfer_to_guest_mode_work_pending(); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + guest_state_exit_irqoff(); + + return ret; +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -874,9 +892,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -911,26 +929,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_arch_vcpu_ctxsync_fp(vcpu); /* - * We may have taken a host interrupt in HYP mode (ie - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in SVC mode, with interrupts - * disabled. Enabling the interrupts now will have - * the effect of taking the interrupt again, in SVC - * mode this time. + * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other + * context synchronization event) is necessary to ensure that + * pending interrupts are taken. */ local_irq_enable(); + isb(); + local_irq_disable(); + + guest_timing_exit_irqoff(); + + local_irq_enable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest we - * account that tick as being spent in the guest. We enable - * preemption after calling guest_exit() so that if we get - * preempted we make sure ticks after that is not counted as - * guest time. - */ - guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ From a4b92cebc31d49b7e6ef0ce584c7f2a2e112877d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Feb 2022 14:48:38 +0000 Subject: [PATCH 0920/1295] arm64: Enable Cortex-A510 erratum 2051678 by default The recently added configuration option for Cortex A510 erratum 2051678 does not have a "default y" unlike other errata fixes. This appears to simply be an oversight since the help text suggests enabling the option if unsure and there's nothing in the commit log to suggest it is intentional. Fixes: 297ae1eb23b0 ("arm64: cpufeature: List early Cortex-A510 parts as having broken dbm") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220201144838.20037-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f2b5a4abef21..c7a474f71eb4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -672,6 +672,7 @@ config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE config ARM64_ERRATUM_2051678 bool "Cortex-A510: 2051678: disable Hardware Update of the page table dirty bit" + default y help This options adds the workaround for ARM Cortex-A510 erratum ARM64_ERRATUM_2051678. Affected Coretex-A510 might not respect the ordering rules for From 1a2beb3d5a0b4051067ecf49ea799bee340e0e7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 31 Jan 2022 15:48:54 +0100 Subject: [PATCH 0921/1295] mailmap: update Christian Brauner's email address At least one of the addresses will stop functioning after February. Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index b76e520809d0..8cd44b0c6579 100644 --- a/.mailmap +++ b/.mailmap @@ -80,6 +80,9 @@ Chris Chiu Christian Borntraeger Christian Borntraeger Christian Borntraeger +Christian Brauner +Christian Brauner +Christian Brauner Christophe Ricard Christoph Hellwig Colin Ian King From b7892f7d5cb2b8187c603dd8ea3a7c44059ccfc2 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 1 Feb 2022 09:31:20 +0000 Subject: [PATCH 0922/1295] tools: Ignore errors from `which' when searching a GCC toolchain When cross-building tools with clang, we run `which $(CROSS_COMPILE)gcc` to detect whether a GCC toolchain provides the standard libraries. It is only a helper because some distros put libraries where LLVM does not automatically find them. On other systems, LLVM detects the libc automatically and does not need this. There, it is completely fine not to have a GCC at all, but some versions of `which' display an error when the command is not found: which: no aarch64-linux-gnu-gcc in ($PATH) Since the error can safely be ignored, throw it to /dev/null. Fixes: cebdb7374577 ("tools: Help cross-building with clang") Reported-by: Nathan Chancellor Signed-off-by: Jean-Philippe Brucker Signed-off-by: Daniel Borkmann Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/bpf/20220201093119.1713207-1-jean-philippe@linaro.org --- tools/scripts/Makefile.include | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index b0be5f40a3f1..79d102304470 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -90,7 +90,7 @@ EXTRA_WARNINGS += -Wstrict-aliasing=3 else ifneq ($(CROSS_COMPILE),) CLANG_CROSS_FLAGS := --target=$(notdir $(CROSS_COMPILE:%-=%)) -GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)gcc)) +GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)gcc 2>/dev/null)) ifneq ($(GCC_TOOLCHAIN_DIR),) CLANG_CROSS_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE)) CLANG_CROSS_FLAGS += --sysroot=$(shell $(CROSS_COMPILE)gcc -print-sysroot) From 472c6e46f589c26057596dcba160712a5b3e02c5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:08 -0800 Subject: [PATCH 0923/1295] xfs: remove XFS_PREALLOC_SYNC Callers can acheive the same thing by calling xfs_log_force_inode() after making their modifications. There is no need for xfs_update_prealloc_flags() to do this. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++------ fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_pnfs.c | 6 ++++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 22ad207bedf4..ed375b3d0614 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -95,8 +95,6 @@ xfs_update_prealloc_flags( ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } @@ -1057,9 +1055,6 @@ xfs_file_fallocate( } } - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - error = xfs_update_prealloc_flags(ip, flags); if (error) goto out_unlock; @@ -1082,8 +1077,14 @@ xfs_file_fallocate( * leave shifted extents past EOF and hence losing access to * the data that is contained within them. */ - if (do_file_insert) + if (do_file_insert) { error = xfs_insert_file_space(ip, offset, len); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c447bf04205a..3fc6d77f5be9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -465,8 +465,7 @@ xfs_itruncate_extents( enum xfs_prealloc_flags { XFS_PREALLOC_SET = (1 << 1), XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_SYNC = (1 << 3), - XFS_PREALLOC_INVISIBLE = (1 << 4), + XFS_PREALLOC_INVISIBLE = (1 << 3), }; int xfs_update_prealloc_flags(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d6334abbc0b3..ce6d66f20385 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -164,10 +164,12 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, - XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); + error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + if (!error) + error = xfs_log_force_inode(ip); if (error) goto out_unlock; + } else { xfs_iunlock(ip, lock_flags); } From fbe7e520036583a783b13ff9744e35c2a329d9a4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:09 -0800 Subject: [PATCH 0924/1295] xfs: fallocate() should call file_modified() In XFS, we always update the inode change and modification time when any fallocate() operation succeeds. Furthermore, as various fallocate modes can change the file contents (extending EOF, punching holes, zeroing things, shifting extents), we should drop file privileges like suid just like we do for a regular write(). There's already a VFS helper that figures all this out for us, so use that. The net effect of this is that we no longer drop suid/sgid if the caller is root, but we also now drop file capabilities. We also move the xfs_update_prealloc_flags() function so that it now is only called by the scope that needs to set the the prealloc flag. Based on a patch from Darrick Wong. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ed375b3d0614..7846d55cba01 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -953,6 +953,10 @@ xfs_file_fallocate( goto out_unlock; } + error = file_modified(file); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1053,11 +1057,12 @@ xfs_file_fallocate( if (error) goto out_unlock; } - } - error = xfs_update_prealloc_flags(ip, flags); - if (error) - goto out_unlock; + error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + if (error) + goto out_unlock; + + } /* Change file size if needed */ if (new_size) { From 0b02c8c0d75a738c98c35f02efb36217c170d78c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:09 -0800 Subject: [PATCH 0925/1295] xfs: set prealloc flag in xfs_alloc_file_space() Now that we only call xfs_update_prealloc_flags() from xfs_file_fallocate() in the case where we need to set the preallocation flag, do this in xfs_alloc_file_space() where we already have the inode joined into a transaction and get rid of the call to xfs_update_prealloc_flags() from the fallocate code. This also means that we now correctly avoid setting the XFS_DIFLAG_PREALLOC flag when xfs_is_always_cow_inode() is true, as these inodes will never have preallocated extents. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 9 +++------ fs/xfs/xfs_file.c | 8 -------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index d4a387d3d0ce..eb2e387ba528 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -850,9 +850,6 @@ xfs_alloc_file_space( rblocks = 0; } - /* - * Allocate and setup the transaction. - */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) @@ -869,9 +866,9 @@ xfs_alloc_file_space( if (error) goto error; - /* - * Complete the transaction - */ + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7846d55cba01..082e3ef81418 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -908,7 +908,6 @@ xfs_file_fallocate( struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; - enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -1006,8 +1005,6 @@ xfs_file_fallocate( } do_file_insert = true; } else { - flags |= XFS_PREALLOC_SET; - if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -1057,11 +1054,6 @@ xfs_file_fallocate( if (error) goto out_unlock; } - - error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); - if (error) - goto out_unlock; - } /* Change file size if needed */ From b39a04636fd7454911b80e7b5ab2a66b011a8145 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:10 -0800 Subject: [PATCH 0926/1295] xfs: move xfs_update_prealloc_flags() to xfs_pnfs.c The operations that xfs_update_prealloc_flags() perform are now unique to xfs_fs_map_blocks(), so move xfs_update_prealloc_flags() to be a static function in xfs_pnfs.c and cut out all the other functionality that is doesn't use anymore. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 32 -------------------------------- fs/xfs/xfs_inode.h | 8 -------- fs/xfs/xfs_pnfs.c | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 082e3ef81418..cecc5dedddff 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -66,38 +66,6 @@ xfs_is_falloc_aligned( return !((pos | len) & mask); } -int -xfs_update_prealloc_flags( - struct xfs_inode *ip, - enum xfs_prealloc_flags flags) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, - 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(flags & XFS_PREALLOC_INVISIBLE)) { - VFS_I(ip)->i_mode &= ~S_ISUID; - if (VFS_I(ip)->i_mode & S_IXGRP) - VFS_I(ip)->i_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (flags & XFS_PREALLOC_SET) - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - if (flags & XFS_PREALLOC_CLEAR) - ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp); -} - /* * Fsync operations on directories are much simpler than on regular files, * as there is no file data to flush, and thus also no need for explicit diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3fc6d77f5be9..b7e8f14d9fca 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -462,14 +462,6 @@ xfs_itruncate_extents( } /* from xfs_file.c */ -enum xfs_prealloc_flags { - XFS_PREALLOC_SET = (1 << 1), - XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_INVISIBLE = (1 << 3), -}; - -int xfs_update_prealloc_flags(struct xfs_inode *ip, - enum xfs_prealloc_flags flags); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ce6d66f20385..4abe17312c2b 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -70,6 +70,40 @@ xfs_fs_get_uuid( return 0; } +/* + * We cannot use file based VFS helpers such as file_modified() to update + * inode state as we modify the data/metadata in the inode here. Hence we have + * to open code the timestamp updates and SUID/SGID stripping. We also need + * to set the inode prealloc flag to ensure that the extents we allocate are not + * removed if the inode is reclaimed from memory before xfs_fs_block_commit() + * is from the client to indicate that data has been written and the file size + * can be extended. + */ +static int +xfs_fs_map_update_inode( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + VFS_I(ip)->i_mode &= ~S_ISUID; + if (VFS_I(ip)->i_mode & S_IXGRP) + VFS_I(ip)->i_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp); +} + /* * Get a layout for the pNFS client. */ @@ -164,7 +198,7 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + error = xfs_fs_map_update_inode(ip); if (!error) error = xfs_log_force_inode(ip); if (error) @@ -257,7 +291,7 @@ xfs_fs_commit_blocks( length = end - start; if (!length) continue; - + /* * Make sure reads through the pagecache see the new data. */ From cea267c235e1b1ec3bfc415f6bd420289bcb3bc9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Jan 2022 13:20:10 -0800 Subject: [PATCH 0927/1295] xfs: ensure log flush at the end of a synchronous fallocate call Since we've started treating fallocate more like a file write, we should flush the log to disk if the user has asked for synchronous writes either by setting it via fcntl flags, or inode flags, or with the sync mount option. We've already got a helper for this, so use it. [The original patch by Darrick was massaged by Dave to fit this patchset] Signed-off-by: Darrick J. Wong Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cecc5dedddff..5bddb1e9e0b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -861,6 +861,21 @@ xfs_break_layouts( return error; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -1048,7 +1063,7 @@ xfs_file_fallocate( goto out_unlock; } - if (file->f_flags & O_DSYNC) + if (xfs_file_sync_writes(file)) error = xfs_log_force_inode(ip); out_unlock: @@ -1081,21 +1096,6 @@ xfs_file_fadvise( return ret; } -/* Does this file, inode, or mount want synchronous writes? */ -static inline bool xfs_file_sync_writes(struct file *filp) -{ - struct xfs_inode *ip = XFS_I(file_inode(filp)); - - if (xfs_has_wsync(ip->i_mount)) - return true; - if (filp->f_flags & (__O_SYNC | O_DSYNC)) - return true; - if (IS_SYNC(file_inode(filp))) - return true; - - return false; -} - STATIC loff_t xfs_file_remap_range( struct file *file_in, From 6dde7acdb3dc2e0b3bcb090aac0b3699396d309f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Jan 2022 13:17:30 -0800 Subject: [PATCH 0928/1295] ethernet: smc911x: fix indentation in get/set EEPROM Build bot produced a smatch indentation warning, the code looks correct but it mixes spaces and tabs. Reported-by: kernel test robot Link: https://lore.kernel.org/r/20220131211730.3940875-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/smsc/smc911x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index dd6f69ced4ee..fc9cef9dcefc 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1648,7 +1648,7 @@ static int smc911x_ethtool_geteeprom(struct net_device *dev, return ret; if ((ret=smc911x_ethtool_read_eeprom_byte(dev, &eebuf[i]))!=0) return ret; - } + } memcpy(data, eebuf+eeprom->offset, eeprom->len); return 0; } @@ -1667,11 +1667,11 @@ static int smc911x_ethtool_seteeprom(struct net_device *dev, return ret; /* write byte */ if ((ret=smc911x_ethtool_write_eeprom_byte(dev, *data))!=0) - return ret; + return ret; if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_WRITE_, i ))!=0) return ret; - } - return 0; + } + return 0; } static int smc911x_ethtool_geteeprom_len(struct net_device *dev) From 04c2a47ffb13c29778e2a14e414ad4cb5a5db4b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 09:20:18 -0800 Subject: [PATCH 0929/1295] net: sched: fix use-after-free in tc_new_tfilter() Whenever tc_new_tfilter() jumps back to replay: label, we need to make sure @q and @chain local variables are cleared again, or risk use-after-free as in [1] For consistency, apply the same fix in tc_ctl_chain() BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581 Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945 CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581 tcf_chain_head_change_item net/sched/cls_api.c:372 [inline] tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386 tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline] tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline] tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2647172059 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059 RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006 RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000 R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000 Allocated by task 1944: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:436 [inline] ____kasan_kmalloc mm/kasan/common.c:515 [inline] ____kasan_kmalloc mm/kasan/common.c:474 [inline] __kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524 kmalloc_node include/linux/slab.h:604 [inline] kzalloc_node include/linux/slab.h:726 [inline] qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941 qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211 tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 3609: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328 kasan_slab_free include/linux/kasan.h:236 [inline] slab_free_hook mm/slub.c:1728 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754 slab_free mm/slub.c:3509 [inline] kfree+0xcb/0x280 mm/slub.c:4562 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Last potentially related work creation: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 __kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348 __call_rcu kernel/rcu/tree.c:3026 [inline] call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106 qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109 tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238 tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x331/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmmsg+0x195/0x470 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff8880985c4800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 776 bytes inside of 1024-byte region [ffff8880985c4800, ffff8880985c4c00) The buggy address belongs to the page: page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0 head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271 alloc_slab_page mm/slub.c:1799 [inline] allocate_slab mm/slub.c:1944 [inline] new_slab+0x28a/0x3b0 mm/slub.c:2004 ___slab_alloc+0x87c/0xe90 mm/slub.c:3018 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105 slab_alloc_node mm/slub.c:3196 [inline] slab_alloc mm/slub.c:3238 [inline] __kmalloc+0x2fb/0x340 mm/slub.c:4420 kmalloc include/linux/slab.h:586 [inline] kzalloc include/linux/slab.h:715 [inline] __register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335 neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787 devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618 inetdev_init+0x286/0x580 net/ipv4/devinet.c:278 inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532 notifier_call_chain+0xb5/0x200 kernel/notifier.c:84 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919 call_netdevice_notifiers_extack net/core/dev.c:1931 [inline] call_netdevice_notifiers net/core/dev.c:1945 [inline] register_netdevice+0x1073/0x1500 net/core/dev.c:9698 veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 release_pages+0x748/0x1220 mm/swap.c:956 tlb_batch_pages_flush mm/mmu_gather.c:50 [inline] tlb_flush_mmu_free mm/mmu_gather.c:243 [inline] tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250 zap_pte_range mm/memory.c:1441 [inline] zap_pmd_range mm/memory.c:1490 [inline] zap_pud_range mm/memory.c:1519 [inline] zap_p4d_range mm/memory.c:1540 [inline] unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561 unmap_single_vma+0x198/0x310 mm/memory.c:1606 unmap_vmas+0x16b/0x2f0 mm/memory.c:1638 exit_mmap+0x201/0x670 mm/mmap.c:3178 __mmput+0x122/0x4b0 kernel/fork.c:1114 mmput+0x56/0x60 kernel/fork.c:1135 exit_mm kernel/exit.c:507 [inline] do_exit+0xa3c/0x2a30 kernel/exit.c:793 do_group_exit+0xd2/0x2f0 kernel/exit.c:935 __do_sys_exit_group kernel/exit.c:946 [inline] __se_sys_exit_group kernel/exit.c:944 [inline] __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Memory state around the buggy address: ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: 470502de5bdb ("net: sched: unlock rules update API") Signed-off-by: Eric Dumazet Cc: Vlad Buslov Cc: Jiri Pirko Cc: Cong Wang Reported-by: syzbot Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d4e27c679123..5f0f346b576f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1945,9 +1945,9 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, bool prio_allocate; u32 parent; u32 chain_index; - struct Qdisc *q = NULL; + struct Qdisc *q; struct tcf_chain_info chain_info; - struct tcf_chain *chain = NULL; + struct tcf_chain *chain; struct tcf_block *block; struct tcf_proto *tp; unsigned long cl; @@ -1976,6 +1976,8 @@ replay: tp = NULL; cl = 0; block = NULL; + q = NULL; + chain = NULL; flags = 0; if (prio == 0) { @@ -2798,8 +2800,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, struct tcmsg *t; u32 parent; u32 chain_index; - struct Qdisc *q = NULL; - struct tcf_chain *chain = NULL; + struct Qdisc *q; + struct tcf_chain *chain; struct tcf_block *block; unsigned long cl; int err; @@ -2809,6 +2811,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: + q = NULL; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) From c6f6f2444bdbe0079e41914a35081530d0409963 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 17:21:06 -0800 Subject: [PATCH 0930/1295] rtnetlink: make sure to refresh master_dev/m_ops in __rtnl_newlink() While looking at one unrelated syzbot bug, I found the replay logic in __rtnl_newlink() to potentially trigger use-after-free. It is better to clear master_dev and m_ops inside the loop, in case we have to replay it. Fixes: ba7d49b1f0f8 ("rtnetlink: provide api for getting and setting slave info") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Link: https://lore.kernel.org/r/20220201012106.216495-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e476403231f0..710da8a36729 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3275,8 +3275,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; - const struct rtnl_link_ops *m_ops = NULL; - struct net_device *master_dev = NULL; + const struct rtnl_link_ops *m_ops; + struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr *tb[IFLA_MAX + 1]; @@ -3314,6 +3314,8 @@ replay: else dev = NULL; + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) From e42e70ad6ae2ae511a6143d2e8da929366e58bd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 18:23:58 -0800 Subject: [PATCH 0931/1295] af_packet: fix data-race in packet_setsockopt / packet_setsockopt When packet_setsockopt( PACKET_FANOUT_DATA ) reads po->fanout, no lock is held, meaning that another thread can change po->fanout. Given that po->fanout can only be set once during the socket lifetime (it is only cleared from fanout_release()), we can use READ_ONCE()/WRITE_ONCE() to document the race. BUG: KCSAN: data-race in packet_setsockopt / packet_setsockopt write to 0xffff88813ae8e300 of 8 bytes by task 14653 on cpu 0: fanout_add net/packet/af_packet.c:1791 [inline] packet_setsockopt+0x22fe/0x24a0 net/packet/af_packet.c:3931 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88813ae8e300 of 8 bytes by task 14654 on cpu 1: packet_setsockopt+0x691/0x24a0 net/packet/af_packet.c:3935 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000000000000000 -> 0xffff888106f8c000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 14654 Comm: syz-executor.3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 47dceb8ecdc1 ("packet: add classic BPF fanout mode") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Link: https://lore.kernel.org/r/20220201022358.330621-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 85ea7ddb48db..ab87f22cc7ec 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1789,7 +1789,10 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); - po->fanout = match; + + /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ + WRITE_ONCE(po->fanout, match); + po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); @@ -3934,7 +3937,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, } case PACKET_FANOUT_DATA: { - if (!po->fanout) + /* Paired with the WRITE_ONCE() in fanout_add() */ + if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); From 479f5547239d970d3833f15f54a6481fffdb91ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Jan 2022 22:52:54 -0800 Subject: [PATCH 0932/1295] tcp: fix mem under-charging with zerocopy sendmsg() We got reports of following warning in inet_sock_destruct() WARN_ON(sk_forward_alloc_get(sk)); Whenever we add a non zero-copy fragment to a pure zerocopy skb, we have to anticipate that whole skb->truesize will be uncharged when skb is finally freed. skb->data_len is the payload length. But the memory truesize estimated by __zerocopy_sg_from_iter() is page aligned. Fixes: 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") Signed-off-by: Eric Dumazet Cc: Talal Ahmad Cc: Arjun Roy Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20220201065254.680532-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 78e81465f5f3..bdf108f544a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1322,10 +1322,13 @@ new_segment: /* skb changing from pure zc to mixed, must charge zc */ if (unlikely(skb_zcopy_pure(skb))) { - if (!sk_wmem_schedule(sk, skb->data_len)) + u32 extra = skb->truesize - + SKB_TRUESIZE(skb_end_offset(skb)); + + if (!sk_wmem_schedule(sk, extra)) goto wait_for_space; - sk_mem_charge(sk, skb->data_len); + sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } From 63e4b45c82ed1bde979da7052229a4229ce9cabf Mon Sep 17 00:00:00 2001 From: Georgi Valkov Date: Tue, 1 Feb 2022 08:16:18 +0100 Subject: [PATCH 0933/1295] ipheth: fix EOVERFLOW in ipheth_rcvbulk_callback When rx_buf is allocated we need to account for IPHETH_IP_ALIGN, which reduces the usable size by 2 bytes. Otherwise we have 1512 bytes usable instead of 1514, and if we receive more than 1512 bytes, ipheth_rcvbulk_callback is called with status -EOVERFLOW, after which the driver malfunctiones and all communication stops. Resolves ipheth 2-1:4.2: ipheth_rcvbulk_callback: urb status: -75 Fixes: f33d9e2b48a3 ("usbnet: ipheth: fix connectivity with iOS 14") Signed-off-by: Georgi Valkov Tested-by: Jan Kiszka Link: https://lore.kernel.org/all/B60B8A4B-92A0-49B3-805D-809A2433B46C@abv.bg/ Link: https://lore.kernel.org/all/24851bd2769434a5fc24730dce8e8a984c5a4505.1643699778.git.jan.kiszka@siemens.com/ Signed-off-by: Jakub Kicinski --- drivers/net/usb/ipheth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index cd33955df0b6..6a769df0b421 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -121,7 +121,7 @@ static int ipheth_alloc_urbs(struct ipheth_device *iphone) if (tx_buf == NULL) goto free_rx_urb; - rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE, + rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, GFP_KERNEL, &rx_urb->transfer_dma); if (rx_buf == NULL) goto free_tx_buf; @@ -146,7 +146,7 @@ error_nomem: static void ipheth_free_urbs(struct ipheth_device *iphone) { - usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->rx_buf, + usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, iphone->rx_buf, iphone->rx_urb->transfer_dma); usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->tx_buf, iphone->tx_urb->transfer_dma); @@ -317,7 +317,7 @@ static int ipheth_rx_submit(struct ipheth_device *dev, gfp_t mem_flags) usb_fill_bulk_urb(dev->rx_urb, udev, usb_rcvbulkpipe(udev, dev->bulk_in), - dev->rx_buf, IPHETH_BUF_SIZE, + dev->rx_buf, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, ipheth_rcvbulk_callback, dev); dev->rx_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; From d0cfa548dbde354de986911d3913897b5448faad Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Sun, 30 Jan 2022 13:37:52 +0200 Subject: [PATCH 0934/1295] net: macsec: Verify that send_sci is on when setting Tx sci explicitly When setting Tx sci explicit, the Rx side is expected to use this sci and not recalculate it from the packet.However, in case of Tx sci is explicit and send_sci is off, the receiver is wrongly recalculate the sci from the source MAC address which most likely be different than the explicit sci. Fix by preventing such configuration when macsec newlink is established and return EINVAL error code on such cases. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Link: https://lore.kernel.org/r/1643542672-29403-1-git-send-email-raeds@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 33ff33c05aab..3d0874331763 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4018,6 +4018,15 @@ static int macsec_newlink(struct net *net, struct net_device *dev, !macsec_check_offload(macsec->offload, macsec)) return -EOPNOTSUPP; + /* send_sci must be set to true when transmit sci explicitly is set */ + if ((data && data[IFLA_MACSEC_SCI]) && + (data && data[IFLA_MACSEC_INC_SCI])) { + u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); + + if (!send_sci) + return -EINVAL; + } + if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); mtu = real_dev->mtu - icv_len - macsec_extra_len(true); From 04f8c12f031fcd0ffa0c72822eb665ceb2c872e7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 6 Jan 2022 16:40:18 +0200 Subject: [PATCH 0935/1295] net/mlx5: Bridge, take rtnl lock in init error handler The mlx5_esw_bridge_cleanup() is expected to be called with rtnl lock taken, which is true for mlx5e_rep_bridge_cleanup() function but not for error handling code in mlx5e_rep_bridge_init(). Add missing rtnl lock/unlock calls and extend both mlx5_esw_bridge_cleanup() and its dual function mlx5_esw_bridge_init() with ASSERT_RTNL() to verify the invariant from now on. Fixes: 7cd6a54a8285 ("net/mlx5: Bridge, handle FDB events") Fixes: 19e9bfa044f3 ("net/mlx5: Bridge, add offload infrastructure") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index c6d2f8c78db7..d5cb27667005 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -509,7 +509,9 @@ err_register_swdev_blk: err_register_swdev: destroy_workqueue(br_offloads->wq); err_alloc_wq: + rtnl_lock(); mlx5_esw_bridge_cleanup(esw); + rtnl_unlock(); } void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index f690f430f40f..05e08cec5a8c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -1574,6 +1574,8 @@ struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw) { struct mlx5_esw_bridge_offloads *br_offloads; + ASSERT_RTNL(); + br_offloads = kvzalloc(sizeof(*br_offloads), GFP_KERNEL); if (!br_offloads) return ERR_PTR(-ENOMEM); @@ -1590,6 +1592,8 @@ void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw) { struct mlx5_esw_bridge_offloads *br_offloads = esw->br_offloads; + ASSERT_RTNL(); + if (!br_offloads) return; From 350d9a823734b5a7e767cddc3bdde5f0bcbb7ff4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 6 Jan 2022 18:45:26 +0200 Subject: [PATCH 0936/1295] net/mlx5: Bridge, ensure dev_name is null-terminated Even though net_device->name is guaranteed to be null-terminated string of size<=IFNAMSIZ, the test robot complains that return value of netdev_name() can be larger: In file included from include/trace/define_trace.h:102, from drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h:113, from drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c:12: drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h: In function 'trace_event_raw_event_mlx5_esw_bridge_fdb_template': >> drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h:24:29: warning: 'strncpy' output may be truncated copying 16 bytes from a string of length 20 [-Wstringop-truncation] 24 | strncpy(__entry->dev_name, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | netdev_name(fdb->dev), | ~~~~~~~~~~~~~~~~~~~~~~ 26 | IFNAMSIZ); | ~~~~~~~~~ This is caused by the fact that default value of IFNAMSIZ is 16, while placeholder value that is returned by netdev_name() for unnamed net devices is larger than that. The offending code is in a tracing function that is only called for mlx5 representors, so there is no straightforward way to reproduce the issue but let's fix it for correctness sake by replacing strncpy() with strscpy() to ensure that resulting string is always null-terminated. Fixes: 9724fd5d9c2a ("net/mlx5: Bridge, add tracepoints") Reported-by: kernel test robot Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h index 3401188e0a60..51ac24e6ec3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h @@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_bridge_fdb_template, __field(unsigned int, used) ), TP_fast_assign( - strncpy(__entry->dev_name, + strscpy(__entry->dev_name, netdev_name(fdb->dev), IFNAMSIZ); memcpy(__entry->addr, fdb->key.addr, ETH_ALEN); From a2446bc77a16cefd27de712d28af2396d6287593 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 4 Jan 2022 10:38:02 +0200 Subject: [PATCH 0937/1295] net/mlx5e: TC, Reject rules with drop and modify hdr action This kind of action is not supported by firmware and generates a syndrome. kernel: mlx5_core 0000:08:00.0: mlx5_cmd_check:777:(pid 102063): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x8708c3) Fixes: d7e75a325cb2 ("net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3d908a7e1406..671f76c350db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3191,6 +3191,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && !modify_header_match_supported(priv, &parse_attr->spec, flow_action, actions, ct_flow, ct_clear, extack)) From 4a08a131351e375a2969b98e46df260ed04dcba7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 16 Jan 2022 09:07:22 +0200 Subject: [PATCH 0938/1295] net/mlx5e: Fix module EEPROM query When querying the module EEPROM, there was a misusage of the 'offset' variable vs the 'query.offset' field. Fix that by always using 'offset' and assigning its value to 'query.offset' right before the mcia register read call. While at it, the cross-pages read size adjustment was changed to be more intuitive. Fixes: e19b0a3474ab ("net/mlx5: Refactor module EEPROM query") Reported-by: Wang Yugui Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 1ef2b6a848c1..7b16a1188aab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -406,23 +406,24 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, switch (module_id) { case MLX5_MODULE_ID_SFP: - mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); + mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); break; case MLX5_MODULE_ID_QSFP: case MLX5_MODULE_ID_QSFP_PLUS: case MLX5_MODULE_ID_QSFP28: - mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); + mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); break; default: mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); return -EINVAL; } - if (query.offset + size > MLX5_EEPROM_PAGE_LENGTH) + if (offset + size > MLX5_EEPROM_PAGE_LENGTH) /* Cross pages read, read until offset 256 in low page */ - size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; + size = MLX5_EEPROM_PAGE_LENGTH - offset; query.size = size; + query.offset = offset; return mlx5_query_mcia(dev, &query, data); } From 3c5193a87b0fea090aa3f769d020337662d87b5e Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 13 Jan 2022 15:48:48 +0200 Subject: [PATCH 0939/1295] net/mlx5: Use del_timer_sync in fw reset flow of halting poll Substitute del_timer() with del_timer_sync() in fw reset polling deactivation flow, in order to prevent a race condition which occurs when del_timer() is called and timer is deactivated while another process is handling the timer interrupt. A situation that led to the following call trace: RIP: 0010:run_timer_softirq+0x137/0x420 recalibrate_cpu_khz+0x10/0x10 ktime_get+0x3e/0xa0 ? sched_clock_cpu+0xb/0xc0 __do_softirq+0xf5/0x2ea irq_exit_rcu+0xc1/0xf0 sysvec_apic_timer_interrupt+0x9e/0xc0 asm_sysvec_apic_timer_interrupt+0x12/0x20 Fixes: 38b9f903f22b ("net/mlx5: Handle sync reset request event") Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 0b0234f9d694..84dbe46d5ede 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -132,7 +132,7 @@ static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; - del_timer(&fw_reset->timer); + del_timer_sync(&fw_reset->timer); } static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health) From 5623ef8a118838aae65363750dfafcba734dc8cb Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 17 Jan 2022 15:00:30 +0200 Subject: [PATCH 0940/1295] net/mlx5e: TC, Reject rules with forward and drop actions Such rules are redundant but allowed and passed to the driver. The driver does not support offloading such rules so return an error. Fixes: 03a9d11e6eeb ("net/mlx5e: Add TC drop and mirred/redirect action parsing for SRIOV offloads") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 671f76c350db..4c6e3c26c1ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3191,6 +3191,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); From 55b2ca702cfa744a9eb108915996a2294da47e71 Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Mon, 17 Jan 2022 15:32:16 +0200 Subject: [PATCH 0941/1295] net/mlx5: Fix offloading with ESWITCH_IPV4_TTL_MODIFY_ENABLE Only prio 1 is supported for nic mode when there is no ignore flow level support in firmware. But for switchdev mode, which supports fixed number of statically pre-allocated prios, this restriction is not relevant so it can be relaxed. Fixes: d671e109bd85 ("net/mlx5: Fix tc max supported prio for nic mode") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index d5e47630e284..e94233a12a32 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -121,12 +121,13 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) { - if (!mlx5_chains_prios_supported(chains)) - return 1; - if (mlx5_chains_ignore_flow_level_supported(chains)) return UINT_MAX; + if (!chains->dev->priv.eswitch || + chains->dev->priv.eswitch->mode != MLX5_ESWITCH_OFFLOADS) + return 1; + /* We should get here only for eswitch case */ return FDB_TC_MAX_PRIO; } From 880b517691908fb753019b9b27cd082e7617debd Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 24 Jan 2022 13:56:26 +0200 Subject: [PATCH 0942/1295] net/mlx5: Bridge, Fix devlink deadlock on net namespace deletion When changing mode to switchdev, rep bridge init registered to netdevice notifier holds the devlink lock and then takes pernet_ops_rwsem. At that time deleting a netns holds pernet_ops_rwsem and then takes the devlink lock. Example sequence is: $ ip netns add foo $ devlink dev eswitch set pci/0000:00:08.0 mode switchdev & $ ip netns del foo deleting netns trace: [ 1185.365555] ? devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.368331] ? mutex_lock_io_nested+0x13f0/0x13f0 [ 1185.370984] ? xt_find_table+0x40/0x100 [ 1185.373244] ? __mutex_lock+0x24a/0x15a0 [ 1185.375494] ? net_generic+0xa0/0x1c0 [ 1185.376844] ? wait_for_completion_io+0x280/0x280 [ 1185.377767] ? devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.378686] devlink_pernet_pre_exit+0x74/0x1c0 [ 1185.379579] ? devlink_nl_cmd_get_dumpit+0x3a0/0x3a0 [ 1185.380557] ? xt_find_table+0xda/0x100 [ 1185.381367] cleanup_net+0x372/0x8e0 changing mode to switchdev trace: [ 1185.411267] down_write+0x13a/0x150 [ 1185.412029] ? down_write_killable+0x180/0x180 [ 1185.413005] register_netdevice_notifier+0x1e/0x210 [ 1185.414000] mlx5e_rep_bridge_init+0x181/0x360 [mlx5_core] [ 1185.415243] mlx5e_uplink_rep_enable+0x269/0x480 [mlx5_core] [ 1185.416464] ? mlx5e_uplink_rep_disable+0x210/0x210 [mlx5_core] [ 1185.417749] mlx5e_attach_netdev+0x232/0x400 [mlx5_core] [ 1185.418906] mlx5e_netdev_attach_profile+0x15b/0x1e0 [mlx5_core] [ 1185.420172] mlx5e_netdev_change_profile+0x15a/0x1d0 [mlx5_core] [ 1185.421459] mlx5e_vport_rep_load+0x557/0x780 [mlx5_core] [ 1185.422624] ? mlx5e_stats_grp_vport_rep_num_stats+0x10/0x10 [mlx5_core] [ 1185.424006] mlx5_esw_offloads_rep_load+0xdb/0x190 [mlx5_core] [ 1185.425277] esw_offloads_enable+0xd74/0x14a0 [mlx5_core] Fix this by registering rep bridges for per net netdev notifier instead of global one, which operats on the net namespace without holding the pernet_ops_rwsem. Fixes: 19e9bfa044f3 ("net/mlx5: Bridge, add offload infrastructure") Signed-off-by: Roi Dayan Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index d5cb27667005..48dc121b2cb4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -491,7 +491,7 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) } br_offloads->netdev_nb.notifier_call = mlx5_esw_bridge_switchdev_port_event; - err = register_netdevice_notifier(&br_offloads->netdev_nb); + err = register_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); if (err) { esw_warn(mdev, "Failed to register bridge offloads netdevice notifier (err=%d)\n", err); @@ -526,7 +526,7 @@ void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) return; cancel_delayed_work_sync(&br_offloads->update_work); - unregister_netdevice_notifier(&br_offloads->netdev_nb); + unregister_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); unregister_switchdev_notifier(&br_offloads->nb); destroy_workqueue(br_offloads->wq); From b8d91145ed7cfa046cc07bcfb277465b9d45da73 Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Wed, 26 Jan 2022 14:14:58 +0200 Subject: [PATCH 0943/1295] net/mlx5e: Fix wrong calculation of header index in HW_GRO The HW doesn't wrap the CQE.shampo.header_index field according to the headers buffer size, instead it always increases it until reaching overflow of u16 size. Thus the mlx5e_handle_rx_cqe_mpwrq_shampo handler should mask the CQE header_index field to find the actual header index in the headers buffer. Fixes: f97d5c2a453e ("net/mlx5e: Add handle SHAMPO cqe support") Signed-off-by: Khalid Manaa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 4cdf8e5b24c2..b789af07829c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -167,6 +167,11 @@ static inline u16 mlx5e_txqsq_get_next_pi(struct mlx5e_txqsq *sq, u16 size) return pi; } +static inline u16 mlx5e_shampo_get_cqe_header_index(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + return be16_to_cpu(cqe->shampo.header_entry_index) & (rq->mpwqe.shampo->hd_per_wq - 1); +} + struct mlx5e_shampo_umr { u16 len; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e86ccc22fb82..3a79ecd38003 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1117,7 +1117,7 @@ static void mlx5e_shampo_update_ipv6_udp_hdr(struct mlx5e_rq *rq, struct ipv6hdr static void mlx5e_shampo_update_fin_psh_flags(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, struct tcphdr *skb_tcp_hd) { - u16 header_index = be16_to_cpu(cqe->shampo.header_entry_index); + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); struct tcphdr *last_tcp_hd; void *last_hd_addr; @@ -1973,7 +1973,7 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) { u16 data_bcnt = mpwrq_get_cqe_byte_cnt(cqe) - cqe->shampo.header_size; - u16 header_index = be16_to_cpu(cqe->shampo.header_entry_index); + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); u32 wqe_offset = be32_to_cpu(cqe->shampo.data_offset); u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); u32 data_offset = wqe_offset & (PAGE_SIZE - 1); From 7957837b816f11eecb9146235bb0715478f4c81f Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Wed, 26 Jan 2022 14:25:55 +0200 Subject: [PATCH 0944/1295] net/mlx5e: Fix broken SKB allocation in HW-GRO In case the HW doesn't perform header-data split, it will write the whole packet into the data buffer in the WQ, in this case the SHAMPO CQE handler couldn't use the header entry to build the SKB, instead it should allocate a new memory to build the SKB using the function: mlx5e_skb_from_cqe_mpwrq_nonlinear. Fixes: f97d5c2a453e ("net/mlx5e: Add handle SHAMPO cqe support") Signed-off-by: Khalid Manaa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3a79ecd38003..ee0a8f5206e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1871,7 +1871,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return skb; } -static void +static struct sk_buff * mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 header_index) { @@ -1895,7 +1895,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, skb = mlx5e_build_linear_skb(rq, hdr, frag_size, rx_headroom, head_size); if (unlikely(!skb)) - return; + return NULL; /* queue up for recycling/reuse */ page_ref_inc(head->page); @@ -1907,7 +1907,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, ALIGN(head_size, sizeof(long))); if (unlikely(!skb)) { rq->stats->buff_alloc_err++; - return; + return NULL; } prefetchw(skb->data); @@ -1918,9 +1918,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, skb->tail += head_size; skb->len += head_size; } - rq->hw_gro_data->skb = skb; - NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = mpwrq_get_cqe_byte_cnt(cqe) - head_size; + return skb; } static void @@ -1980,6 +1978,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq u32 cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); u16 wqe_id = be16_to_cpu(cqe->wqe_id); u32 page_idx = wqe_offset >> PAGE_SHIFT; + u16 head_size = cqe->shampo.header_size; struct sk_buff **skb = &rq->hw_gro_data->skb; bool flush = cqe->shampo.flush; bool match = cqe->shampo.match; @@ -2011,9 +2010,16 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq } if (!*skb) { - mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); + if (likely(head_size)) + *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); + else + *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe_bcnt, data_offset, + page_idx); if (unlikely(!*skb)) goto free_hd_entry; + + NAPI_GRO_CB(*skb)->count = 1; + skb_shinfo(*skb)->gso_size = cqe_bcnt - head_size; } else { NAPI_GRO_CB(*skb)->count++; if (NAPI_GRO_CB(*skb)->count == 2 && @@ -2027,8 +2033,10 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq } } - di = &wi->umr.dma_info[page_idx]; - mlx5e_fill_skb_data(*skb, rq, di, data_bcnt, data_offset); + if (likely(head_size)) { + di = &wi->umr.dma_info[page_idx]; + mlx5e_fill_skb_data(*skb, rq, di, data_bcnt, data_offset); + } mlx5e_shampo_complete_rx_cqe(rq, cqe, cqe_bcnt, *skb); if (flush) From ec41332e02bd0acf1f24206867bb6a02f5877a62 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 13 Jan 2022 15:11:42 +0200 Subject: [PATCH 0945/1295] net/mlx5e: Fix handling of wrong devices during bond netevent Current implementation of bond netevent handler only check if the handled netdev is VF representor and it missing a check if the VF representor is on the same phys device of the bond handling the netevent. Fix by adding the missing check and optimizing the check if the netdev is VF representor so it will not access uninitialized private data and crashes. BUG: kernel NULL pointer dereference, address: 000000000000036c PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI Workqueue: eth3bond0 bond_mii_monitor [bonding] RIP: 0010:mlx5e_is_uplink_rep+0xc/0x50 [mlx5_core] RSP: 0018:ffff88812d69fd60 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8881cf800000 RCX: 0000000000000000 RDX: ffff88812d69fe10 RSI: 000000000000001b RDI: ffff8881cf800880 RBP: ffff8881cf800000 R08: 00000445cabccf2b R09: 0000000000000008 R10: 0000000000000004 R11: 0000000000000008 R12: ffff88812d69fe10 R13: 00000000fffffffe R14: ffff88820c0f9000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88846fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000036c CR3: 0000000103d80006 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5e_eswitch_uplink_rep+0x31/0x40 [mlx5_core] mlx5e_rep_is_lag_netdev+0x94/0xc0 [mlx5_core] mlx5e_rep_esw_bond_netevent+0xeb/0x3d0 [mlx5_core] raw_notifier_call_chain+0x41/0x60 call_netdevice_notifiers_info+0x34/0x80 netdev_lower_state_changed+0x4e/0xa0 bond_mii_monitor+0x56b/0x640 [bonding] process_one_work+0x1b9/0x390 worker_thread+0x4d/0x3d0 ? rescuer_thread+0x350/0x350 kthread+0x124/0x150 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Fixes: 7e51891a237f ("net/mlx5e: Use netdev events to set/del egress acl forward-to-vport rule") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/rep/bond.c | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c index 9c076aa20306..b6f5c1bcdbcd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -183,18 +183,7 @@ void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) { - struct mlx5e_rep_priv *rpriv; - struct mlx5e_priv *priv; - - /* A given netdev is not a representor or not a slave of LAG configuration */ - if (!mlx5e_eswitch_rep(netdev) || !netif_is_lag_port(netdev)) - return false; - - priv = netdev_priv(netdev); - rpriv = priv->ppriv; - - /* Egress acl forward to vport is supported only non-uplink representor */ - return rpriv->rep->vport != MLX5_VPORT_UPLINK; + return netif_is_lag_port(netdev) && mlx5e_eswitch_vf_rep(netdev); } static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr) @@ -210,9 +199,6 @@ static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *pt u16 fwd_vport_num; int err; - if (!mlx5e_rep_is_lag_netdev(netdev)) - return; - info = ptr; lag_info = info->lower_state_info; /* This is not an event of a representor becoming active slave */ @@ -266,9 +252,6 @@ static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) struct net_device *lag_dev; struct mlx5e_priv *priv; - if (!mlx5e_rep_is_lag_netdev(netdev)) - return; - priv = netdev_priv(netdev); rpriv = priv->ppriv; lag_dev = info->upper_dev; @@ -293,6 +276,19 @@ static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_bond *bond; + struct mlx5e_priv *priv; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return NOTIFY_DONE; + + bond = container_of(nb, struct mlx5e_rep_bond, nb); + priv = netdev_priv(netdev); + rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, REP_ETH); + /* Verify VF representor is on the same device of the bond handling the netevent. */ + if (rpriv->uplink_priv.bond != bond) + return NOTIFY_DONE; switch (event) { case NETDEV_CHANGELOWERSTATE: From d8e5883d694bb053b19c4142a2d1f43a34f6fe2c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 30 Jan 2022 16:00:41 +0200 Subject: [PATCH 0946/1295] net/mlx5: E-Switch, Fix uninitialized variable modact The variable modact is not initialized before used in command modify header allocation which can cause command to fail. Fix by initializing modact with zeros. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: 8f1e0b97cc70 ("net/mlx5: E-Switch, Mark miss packets with new chain id mapping") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index e94233a12a32..df58cba37930 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -212,7 +212,7 @@ static int create_chain_restore(struct fs_chain *chain) { struct mlx5_eswitch *esw = chain->chains->dev->priv.eswitch; - char modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)]; + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; struct mlx5_fs_chains *chains = chain->chains; enum mlx5e_tc_attr_to_reg chain_to_reg; struct mlx5_modify_hdr *mod_hdr; From 736dfe4e68b868829a1e89dfef4a44c1580d4478 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 18 Jan 2022 13:31:54 +0200 Subject: [PATCH 0947/1295] net/mlx5e: Don't treat small ceil values as unlimited in HTB offload The hardware spec defines max_average_bw == 0 as "unlimited bandwidth". max_average_bw is calculated as `ceil / BYTES_IN_MBIT`, which can become 0 when ceil is small, leading to an undesired effect of having no bandwidth limit. This commit fixes it by rounding up small values of ceil to 1 Mbit/s. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index 00449df98a5e..c1e07496c89c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -570,7 +570,8 @@ static int mlx5e_htb_convert_rate(struct mlx5e_priv *priv, u64 rate, static void mlx5e_htb_convert_ceil(struct mlx5e_priv *priv, u64 ceil, u32 *max_average_bw) { - *max_average_bw = div_u64(ceil, BYTES_IN_MBIT); + /* Hardware treats 0 as "unlimited", set at least 1. */ + *max_average_bw = max_t(u32, div_u64(ceil, BYTES_IN_MBIT), 1); qos_dbg(priv->mdev, "Convert: ceil %llu -> max_average_bw %u\n", ceil, *max_average_bw); From 5352859b3bfa0ca188b2f1d2c1436fddc781e3b6 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 2 Dec 2021 17:43:50 +0200 Subject: [PATCH 0948/1295] net/mlx5e: IPsec: Fix crypto offload for non TCP/UDP encapsulated traffic IPsec crypto offload always set the ethernet segment checksum flags with the inner L4 header checksum flag enabled for encapsulated IPsec offloaded packet regardless of the encapsulated L4 header type, and even if it doesn't exists in the first place, this breaks non TCP/UDP traffic as such. Set the inner L4 checksum flag only when the encapsulated L4 header protocol is TCP/UDP using software parser swp_inner_l4_offset field as indication. Fixes: 5cfb540ef27b ("net/mlx5e: Set IPsec WAs only in IP's non checksum partial case.") Signed-off-by: Raed Salem Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h index b98db50c3418..428881e0adcb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -131,14 +131,17 @@ static inline bool mlx5e_ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { - struct xfrm_offload *xo = xfrm_offload(skb); + u8 inner_ipproto; if (!mlx5e_ipsec_eseg_meta(eseg)) return false; eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; - if (xo->inner_ipproto) { - eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM | MLX5_ETH_WQE_L3_INNER_CSUM; + inner_ipproto = xfrm_offload(skb)->inner_ipproto; + if (inner_ipproto) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (inner_ipproto == IPPROTO_TCP || inner_ipproto == IPPROTO_UDP) + eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; sq->stats->csum_partial_inner++; From de47db0cf7f4a9c555ad204e06baa70b50a70d08 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 2 Dec 2021 17:49:01 +0200 Subject: [PATCH 0949/1295] net/mlx5e: IPsec: Fix tunnel mode crypto offload for non TCP/UDP traffic IPsec Tunnel mode crypto offload software parser (SWP) setting in data path currently always set the inner L4 offset regardless of the encapsulated L4 header type and whether it exists in the first place, this breaks non TCP/UDP traffic as such. Set the SWP inner L4 offset only when the IPsec tunnel encapsulated L4 header protocol is TCP/UDP. While at it fix inner ip protocol read for setting MLX5_ETH_WQE_SWP_INNER_L4_UDP flag to address the case where the ip header protocol is IPv6. Fixes: f1267798c980 ("net/mlx5: Fix checksum issue of VXLAN and IPsec crypto offload") Signed-off-by: Raed Salem Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 2db9573a3fe6..b56fea142c24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -157,11 +157,20 @@ static void mlx5e_ipsec_set_swp(struct sk_buff *skb, /* Tunnel mode */ if (mode == XFRM_MODE_TUNNEL) { eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; - eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; if (xo->proto == IPPROTO_IPV6) eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; - if (inner_ip_hdr(skb)->protocol == IPPROTO_UDP) + + switch (xo->inner_ipproto) { + case IPPROTO_UDP: eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + /* IP | ESP | IP | [TCP | UDP] */ + eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + default: + break; + } return; } From 5b209d1a22afabfb7d644abb10510c5713a3e569 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 1 Feb 2022 15:27:48 +0200 Subject: [PATCH 0950/1295] net/mlx5e: Avoid implicit modify hdr for decap drop rule Currently the driver adds implicit modify hdr action for decap rules on tunnel devices if the port is an ovs port. This is also done if the action is drop and makes the modify hdr redundant and also the FW doesn't support it and will generate a syndrome. kernel: mlx5_core 0000:08:00.0: mlx5_cmd_check:777:(pid 102063): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x8708c3) Fix it by adding the implicit modify hdr only for fwd actions. Fixes: b16eb3c81fe2 ("net/mlx5: Support internal port as decap route device") Fixes: 077cdda764c7 ("net/mlx5e: TC, Fix memory leak with rules with internal port") Signed-off-by: Roi Dayan Reviewed-by: Ariel Levkovich Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 4c6e3c26c1ab..2022fa4a9598 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1414,7 +1414,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, if (err) goto err_out; - if (!attr->chain && esw_attr->int_port) { + if (!attr->chain && esw_attr->int_port && + attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { /* If decap route device is internal port, change the * source vport value in reg_c0 back to uplink just in * case the rule performs goto chain > 0. If we have a miss From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:22:41 -0800 Subject: [PATCH 0951/1295] net/mlx5e: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() in struct vlan_ethhdr around members h_dest and h_source, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of h_dest. "pahole" shows no size nor member offset changes to struct vlan_ethhdr. "objdump -d" shows no object code changes. Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- include/linux/if_vlan.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 7fd33b356cc8..ee7ecb88adc1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -208,7 +208,7 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) int cpy1_sz = 2 * ETH_ALEN; int cpy2_sz = ihs - cpy1_sz; - memcpy(vhdr, skb->data, cpy1_sz); + memcpy(&vhdr->addrs, skb->data, cpy1_sz); vhdr->h_vlan_proto = skb->vlan_proto; vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8420fe504927..2be4dd7e90a9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -46,8 +46,10 @@ struct vlan_hdr { * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; + struct_group(addrs, + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; From ad5185735f7dab342fdd0dd41044da4c9ccfef67 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:20:28 -0800 Subject: [PATCH 0952/1295] net/mlx5e: Avoid field-overflowing memcpy() In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use flexible arrays instead of zero-element arrays (which look like they are always overflowing) and split the cross-field memcpy() into two halves that can be appropriately bounds-checked by the compiler. We were doing: #define ETH_HLEN 14 #define VLAN_HLEN 4 ... #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) ... struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); ... struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg = wqe->data; ... memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE); target is wqe->eth.inline_hdr.start (which the compiler sees as being 2 bytes in size), but copying 18, intending to write across start (really vlan_tci, 2 bytes). The remaining 16 bytes get written into wqe->data[0], covering byte_count (4 bytes), lkey (4 bytes), and addr (8 bytes). struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; /* 0 16 */ struct mlx5_wqe_eth_seg eth; /* 16 16 */ struct mlx5_wqe_data_seg data[]; /* 32 0 */ /* size: 32, cachelines: 1, members: 3 */ /* last cacheline: 32 bytes */ }; struct mlx5_wqe_eth_seg { u8 swp_outer_l4_offset; /* 0 1 */ u8 swp_outer_l3_offset; /* 1 1 */ u8 swp_inner_l4_offset; /* 2 1 */ u8 swp_inner_l3_offset; /* 3 1 */ u8 cs_flags; /* 4 1 */ u8 swp_flags; /* 5 1 */ __be16 mss; /* 6 2 */ __be32 flow_table_metadata; /* 8 4 */ union { struct { __be16 sz; /* 12 2 */ u8 start[2]; /* 14 2 */ } inline_hdr; /* 12 4 */ struct { __be16 type; /* 12 2 */ __be16 vlan_tci; /* 14 2 */ } insert; /* 12 4 */ __be32 trailer; /* 12 4 */ }; /* 12 4 */ /* size: 16, cachelines: 1, members: 9 */ /* last cacheline: 16 bytes */ }; struct mlx5_wqe_data_seg { __be32 byte_count; /* 0 4 */ __be32 lkey; /* 4 4 */ __be64 addr; /* 8 8 */ /* size: 16, cachelines: 1, members: 3 */ /* last cacheline: 16 bytes */ }; So, split the memcpy() so the compiler can reason about the buffer sizes. "pahole" shows no size nor member offset changes to struct mlx5e_tx_wqe nor struct mlx5e_umr_wqe. "objdump -d" shows no meaningful object code changes (i.e. only source line number induced differences and optimizations). Fixes: b5503b994ed5 ("net/mlx5e: XDP TX forwarding support") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 812e6810cb3b..c14e06ca64d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -224,7 +224,7 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_eth_seg eth; - struct mlx5_wqe_data_seg data[0]; + struct mlx5_wqe_data_seg data[]; }; struct mlx5e_rx_wqe_ll { @@ -241,8 +241,8 @@ struct mlx5e_umr_wqe { struct mlx5_wqe_umr_ctrl_seg uctrl; struct mlx5_mkey_seg mkc; union { - struct mlx5_mtt inline_mtts[0]; - struct mlx5_klm inline_klms[0]; + DECLARE_FLEX_ARRAY(struct mlx5_mtt, inline_mtts); + DECLARE_FLEX_ARRAY(struct mlx5_klm, inline_klms); }; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 338d65e2c9ce..56e10c84a706 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -341,8 +341,10 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, /* copy the inline part if required */ if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { - memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE); + memcpy(eseg->inline_hdr.start, xdptxd->data, sizeof(eseg->inline_hdr.start)); eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE); + memcpy(dseg, xdptxd->data + sizeof(eseg->inline_hdr.start), + MLX5E_XDP_MIN_INLINE - sizeof(eseg->inline_hdr.start)); dma_len -= MLX5E_XDP_MIN_INLINE; dma_addr += MLX5E_XDP_MIN_INLINE; dseg++; From 9a8406ba1a9a2965c27e0db1d7753471d12ee9ff Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Mon, 24 Jan 2022 10:40:07 +0800 Subject: [PATCH 0953/1295] phy: dphy: Correct clk_pre parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The D-PHY specification (v1.2) explicitly mentions that the T-CLK-PRE parameter's unit is Unit Interval(UI) and the minimum value is 8. Also, kernel doc of the 'clk_pre' member of struct phy_configure_opts_mipi_dphy mentions that it should be in UI. However, the dphy core driver wrongly sets 'clk_pre' to 8000, which seems to hint that it's in picoseconds. So, let's fix the dphy core driver to correctly reflect the T-CLK-PRE parameter's minimum value according to the D-PHY specification. I'm assuming that all impacted custom drivers shall program values in TxByteClkHS cycles into hardware for the T-CLK-PRE parameter. The D-PHY specification mentions that the frequency of TxByteClkHS is exactly 1/8 the High-Speed(HS) bit rate(each HS bit consumes one UI). So, relevant custom driver code is changed to program those values as DIV_ROUND_UP(cfg->clk_pre, BITS_PER_BYTE), then. Note that I've only tested the patch with RM67191 DSI panel on i.MX8mq EVK. Help is needed to test with other i.MX8mq, Meson and Rockchip platforms, as I don't have the hardwares. Fixes: 2ed869990e14 ("phy: Add MIPI D-PHY configuration options") Tested-by: Liu Ying # RM67191 DSI panel on i.MX8mq EVK Reviewed-by: Andrzej Hajda Reviewed-by: Neil Armstrong # for phy-meson-axg-mipi-dphy.c Tested-by: Neil Armstrong # for phy-meson-axg-mipi-dphy.c Tested-by: Guido Günther # Librem 5 (imx8mq) with it's rather picky panel Reviewed-by: Laurent Pinchart Signed-off-by: Liu Ying Link: https://lore.kernel.org/r/20220124024007.1465018-1-victor.liu@nxp.com Signed-off-by: Vinod Koul --- drivers/gpu/drm/bridge/nwl-dsi.c | 12 +++++------- drivers/phy/amlogic/phy-meson-axg-mipi-dphy.c | 3 ++- drivers/phy/phy-core-mipi-dphy.c | 4 ++-- drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c | 3 ++- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/bridge/nwl-dsi.c b/drivers/gpu/drm/bridge/nwl-dsi.c index a7389a0facfb..af07eeb47ca0 100644 --- a/drivers/gpu/drm/bridge/nwl-dsi.c +++ b/drivers/gpu/drm/bridge/nwl-dsi.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -196,12 +197,9 @@ static u32 ps2bc(struct nwl_dsi *dsi, unsigned long long ps) /* * ui2bc - UI time periods to byte clock cycles */ -static u32 ui2bc(struct nwl_dsi *dsi, unsigned long long ui) +static u32 ui2bc(unsigned int ui) { - u32 bpp = mipi_dsi_pixel_format_to_bpp(dsi->format); - - return DIV64_U64_ROUND_UP(ui * dsi->lanes, - dsi->mode.clock * 1000 * bpp); + return DIV_ROUND_UP(ui, BITS_PER_BYTE); } /* @@ -232,12 +230,12 @@ static int nwl_dsi_config_host(struct nwl_dsi *dsi) } /* values in byte clock cycles */ - cycles = ui2bc(dsi, cfg->clk_pre); + cycles = ui2bc(cfg->clk_pre); DRM_DEV_DEBUG_DRIVER(dsi->dev, "cfg_t_pre: 0x%x\n", cycles); nwl_dsi_write(dsi, NWL_DSI_CFG_T_PRE, cycles); cycles = ps2bc(dsi, cfg->lpx + cfg->clk_prepare + cfg->clk_zero); DRM_DEV_DEBUG_DRIVER(dsi->dev, "cfg_tx_gap (pre): 0x%x\n", cycles); - cycles += ui2bc(dsi, cfg->clk_pre); + cycles += ui2bc(cfg->clk_pre); DRM_DEV_DEBUG_DRIVER(dsi->dev, "cfg_t_post: 0x%x\n", cycles); nwl_dsi_write(dsi, NWL_DSI_CFG_T_POST, cycles); cycles = ps2bc(dsi, cfg->hs_exit); diff --git a/drivers/phy/amlogic/phy-meson-axg-mipi-dphy.c b/drivers/phy/amlogic/phy-meson-axg-mipi-dphy.c index cd2332bf0e31..fdbd64c03e12 100644 --- a/drivers/phy/amlogic/phy-meson-axg-mipi-dphy.c +++ b/drivers/phy/amlogic/phy-meson-axg-mipi-dphy.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -250,7 +251,7 @@ static int phy_meson_axg_mipi_dphy_power_on(struct phy *phy) (DIV_ROUND_UP(priv->config.clk_zero, temp) << 16) | (DIV_ROUND_UP(priv->config.clk_prepare, temp) << 24)); regmap_write(priv->regmap, MIPI_DSI_CLK_TIM1, - DIV_ROUND_UP(priv->config.clk_pre, temp)); + DIV_ROUND_UP(priv->config.clk_pre, BITS_PER_BYTE)); regmap_write(priv->regmap, MIPI_DSI_HS_TIM, DIV_ROUND_UP(priv->config.hs_exit, temp) | diff --git a/drivers/phy/phy-core-mipi-dphy.c b/drivers/phy/phy-core-mipi-dphy.c index 288c9c67aa74..ccb4045685cd 100644 --- a/drivers/phy/phy-core-mipi-dphy.c +++ b/drivers/phy/phy-core-mipi-dphy.c @@ -36,7 +36,7 @@ int phy_mipi_dphy_get_default_config(unsigned long pixel_clock, cfg->clk_miss = 0; cfg->clk_post = 60000 + 52 * ui; - cfg->clk_pre = 8000; + cfg->clk_pre = 8; cfg->clk_prepare = 38000; cfg->clk_settle = 95000; cfg->clk_term_en = 0; @@ -97,7 +97,7 @@ int phy_mipi_dphy_config_validate(struct phy_configure_opts_mipi_dphy *cfg) if (cfg->clk_post < (60000 + 52 * ui)) return -EINVAL; - if (cfg->clk_pre < 8000) + if (cfg->clk_pre < 8) return -EINVAL; if (cfg->clk_prepare < 38000 || cfg->clk_prepare > 95000) diff --git a/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c b/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c index 347dc79a18c1..630e01b5c19b 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c @@ -5,6 +5,7 @@ * Author: Wyon Bi */ +#include #include #include #include @@ -364,7 +365,7 @@ static void inno_dsidphy_mipi_mode_enable(struct inno_dsidphy *inno) * The value of counter for HS Tclk-pre * Tclk-pre = Tpin_txbyteclkhs * value */ - clk_pre = DIV_ROUND_UP(cfg->clk_pre, t_txbyteclkhs); + clk_pre = DIV_ROUND_UP(cfg->clk_pre, BITS_PER_BYTE); /* * The value of counter for HS Tlpx Time From 0fa0f99fc84e41057cbdd2efbfe91c6b2f47dd9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:19 +0200 Subject: [PATCH 0954/1295] nvme: fix a possible use-after-free in controller reset during load Unlike .queue_rq, in .submit_async_event drivers may not check the ctrl readiness for AER submission. This may lead to a use-after-free condition that was observed with nvme-tcp. The race condition may happen in the following scenario: 1. driver executes its reset_ctrl_work 2. -> nvme_stop_ctrl - flushes ctrl async_event_work 3. ctrl sends AEN which is received by the host, which in turn schedules AEN handling 4. teardown admin queue (which releases the queue socket) 5. AEN processed, submits another AER, calling the driver to submit 6. driver attempts to send the cmd ==> use-after-free In order to fix that, add ctrl state check to validate the ctrl is actually able to accept the AER submission. This addresses the above race in controller resets because the driver during teardown should: 1. change ctrl state to RESETTING 2. flush async_event_work (as well as other async work elements) So after 1,2, any other AER command will find the ctrl state to be RESETTING and bail out without submitting the AER. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e0bfda04bd7..961a5f8a44d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4253,7 +4253,14 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); nvme_aen_uevent(ctrl); - ctrl->ops->submit_async_event(ctrl); + + /* + * The transport drivers must guarantee AER submission here is safe by + * flushing ctrl async_event_work after changing the controller state + * from LIVE and before freeing the admin queue. + */ + if (ctrl->state == NVME_CTRL_LIVE) + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) From ff9fc7ebf5c06de1ef72a69f9b1ab40af8b07f9e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:20 +0200 Subject: [PATCH 0955/1295] nvme-tcp: fix possible use-after-free in transport error_recovery work While nvme_tcp_submit_async_event_work is checking the ctrl and queue state before preparing the AER command and scheduling io_work, in order to fully prevent a race where this check is not reliable the error recovery work must flush async_event_work before continuing to destroy the admin queue after setting the ctrl state to RESETTING such that there is no race .submit_async_event and the error recovery handler itself changing the ctrl state. Tested-by: Chris Leech Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ceb28675fdf..01e24b5703db 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2096,6 +2096,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; nvme_stop_keep_alive(ctrl); + flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); From b6bb1722f34bbdbabed27acdceaf585d300c5fd2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Feb 2022 14:54:21 +0200 Subject: [PATCH 0956/1295] nvme-rdma: fix possible use-after-free in transport error_recovery work While nvme_rdma_submit_async_event_work is checking the ctrl and queue state before preparing the AER command and scheduling io_work, in order to fully prevent a race where this check is not reliable the error recovery work must flush async_event_work before continuing to destroy the admin queue after setting the ctrl state to RESETTING such that there is no race .submit_async_event and the error recovery handler itself changing the ctrl state. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 850f84d204d0..9c55e4be8a39 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1200,6 +1200,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); nvme_stop_keep_alive(&ctrl->ctrl); + flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); From 2c212e1baedcd782b2535a3f86bc491977677c0e Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 28 Jan 2022 15:06:43 +0100 Subject: [PATCH 0957/1295] KVM: s390: Return error on SIDA memop on normal guest Refuse SIDA memops on guests which are not protected. For normal guests, the secure instruction data address designation, which determines the location we access, is not under control of KVM. Fixes: 19e122776886 (KVM: S390: protvirt: Introduce instruction data area bounce buffer) Signed-off-by: Janis Schoetterl-Glausch Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 577f1ead6a51..2296b1ff1e02 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4667,6 +4667,8 @@ static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, return -EINVAL; if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: From eba1e44beef88aa722f07755f79f604cd5d92290 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Feb 2022 10:34:54 +0000 Subject: [PATCH 0958/1295] irqchip/gic-v3-its: Skip HP notifier when no ITS is registered We have some systems out there that have both LPI support and an ITS, but that don't expose the ITS in their firmware tables (either because it is broken or because they run under a hypervisor that hides it...). Is such a configuration, we still register the HP notifier to free the allocated tables if needed, resulting in a warning as there is no memory to free (nothing was allocated the first place). Fix it by keying the HP notifier on the presence of at least one sucessfully probed ITS. Fixes: d23bc2bc1d63 ("irqchip/gic-v3-its: Postpone LPI pending table freeing and memreserve") Reported-by: Steev Klimaszewski Tested-by: Steev Klimaszewski Signed-off-by: Marc Zyngier Cc: Valentin Schneider Link: https://lore.kernel.org/r/20220202103454.2480465-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 9e93ff2b6375..cd772973114a 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -5517,6 +5517,9 @@ int __init its_lpi_memreserve_init(void) if (!efi_enabled(EFI_CONFIG_TABLES)) return 0; + if (list_empty(&its_nodes)) + return 0; + gic_rdists->cpuhp_memreserve_state = CPUHP_INVALID; state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "irqchip/arm/gicv3/memreserve:online", From 321a8be37e1a93cefeae990107533142c8515933 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 30 Jan 2022 21:56:33 +0800 Subject: [PATCH 0959/1295] dt-bindings: update riscv plic compatible string Add the compatible string "thead,c900-plic" to the riscv plic bindings to support allwinner d1 SOC which contains c906 core. Signed-off-by: Guo Ren Cc: Anup Patel Cc: Heiko Stuebner Cc: Rob Herring Cc: Rob Herring Cc: Palmer Dabbelt Cc: Samuel Holland Reviewed-by: Rob Herring Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220130135634.1213301-2-guoren@kernel.org --- .../sifive,plic-1.0.0.yaml | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index 0dfa6b26e099..27092c6a86c4 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -35,6 +35,10 @@ description: contains a specific memory layout, which is documented in chapter 8 of the SiFive U5 Coreplex Series Manual . + The thead,c900-plic is different from sifive,plic-1.0.0 in opensbi, the + T-HEAD PLIC implementation requires setting a delegation bit to allow access + from S-mode. So add thead,c900-plic to distinguish them. + maintainers: - Sagar Kadam - Paul Walmsley @@ -42,12 +46,17 @@ maintainers: properties: compatible: - items: - - enum: - - sifive,fu540-c000-plic - - starfive,jh7100-plic - - canaan,k210-plic - - const: sifive,plic-1.0.0 + oneOf: + - items: + - enum: + - sifive,fu540-c000-plic + - starfive,jh7100-plic + - canaan,k210-plic + - const: sifive,plic-1.0.0 + - items: + - enum: + - allwinner,sun20i-d1-plic + - const: thead,c900-plic reg: maxItems: 1 From 1d4df649cbb4b26d19bea38ecff4b65b10a1bbca Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 30 Jan 2022 21:56:34 +0800 Subject: [PATCH 0960/1295] irqchip/sifive-plic: Add missing thead,c900-plic match string The thead,c900-plic has been used in opensbi to distinguish PLIC [1]. Although PLICs have the same behaviors in Linux, they are different hardware with some custom initializing in firmware(opensbi). Qute opensbi patch commit-msg by Samuel: The T-HEAD PLIC implementation requires setting a delegation bit to allow access from S-mode. Now that the T-HEAD PLIC has its own compatible string, set this bit automatically from the PLIC driver, instead of reaching into the PLIC's MMIO space from another driver. [1]: https://github.com/riscv-software-src/opensbi/commit/78c2b19218bd62653b9fb31623a42ced45f38ea6 Signed-off-by: Guo Ren Cc: Anup Patel Cc: Marc Zyngier Cc: Palmer Dabbelt Cc: Samuel Holland Cc: Thomas Gleixner Tested-by: Samuel Holland Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220130135634.1213301-3-guoren@kernel.org --- drivers/irqchip/irq-sifive-plic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 259065d271ef..09cc98266d30 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -398,3 +398,4 @@ out_free_priv: IRQCHIP_DECLARE(sifive_plic, "sifive,plic-1.0.0", plic_init); IRQCHIP_DECLARE(riscv_plic0, "riscv,plic0", plic_init); /* for legacy systems */ +IRQCHIP_DECLARE(thead_c900_plic, "thead,c900-plic", plic_init); /* for firmware driver */ From 2cba05451a6d0c703bb74f1a250691404f27c4f1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 31 Jan 2022 11:35:53 +0100 Subject: [PATCH 0961/1295] gpio: aggregator: Fix calling into sleeping GPIO controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the parent GPIO controller is a sleeping controller (e.g. a GPIO controller connected to I2C), getting or setting a GPIO triggers a might_sleep() warning. This happens because the GPIO Aggregator takes the can_sleep flag into account only for its internal locking, not for calling into the parent GPIO controller. Fix this by using the gpiod_[gs]et*_cansleep() APIs when calling into a sleeping GPIO controller. Reported-by: Mikko Salomäki Fixes: 828546e24280f721 ("gpio: Add GPIO Aggregator") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 869dc952cf45..0cb2664085cf 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -278,7 +278,8 @@ static int gpio_fwd_get(struct gpio_chip *chip, unsigned int offset) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); - return gpiod_get_value(fwd->descs[offset]); + return chip->can_sleep ? gpiod_get_value_cansleep(fwd->descs[offset]) + : gpiod_get_value(fwd->descs[offset]); } static int gpio_fwd_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, @@ -293,7 +294,10 @@ static int gpio_fwd_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, for_each_set_bit(i, mask, fwd->chip.ngpio) descs[j++] = fwd->descs[i]; - error = gpiod_get_array_value(j, descs, NULL, values); + if (fwd->chip.can_sleep) + error = gpiod_get_array_value_cansleep(j, descs, NULL, values); + else + error = gpiod_get_array_value(j, descs, NULL, values); if (error) return error; @@ -328,7 +332,10 @@ static void gpio_fwd_set(struct gpio_chip *chip, unsigned int offset, int value) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); - gpiod_set_value(fwd->descs[offset], value); + if (chip->can_sleep) + gpiod_set_value_cansleep(fwd->descs[offset], value); + else + gpiod_set_value(fwd->descs[offset], value); } static void gpio_fwd_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, @@ -343,7 +350,10 @@ static void gpio_fwd_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, descs[j++] = fwd->descs[i]; } - gpiod_set_array_value(j, descs, NULL, values); + if (fwd->chip.can_sleep) + gpiod_set_array_value_cansleep(j, descs, NULL, values); + else + gpiod_set_array_value(j, descs, NULL, values); } static void gpio_fwd_set_multiple_locked(struct gpio_chip *chip, From a01994f5e5c79d3a35e5e8cf4252c7f2147323c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Jan 2022 12:32:51 +0100 Subject: [PATCH 0962/1295] x86/perf: Default set FREEZE_ON_SMI for all Kyle reported that rr[0] has started to malfunction on Comet Lake and later CPUs due to EFI starting to make use of CPL3 [1] and the PMU event filtering not distinguishing between regular CPL3 and SMM CPL3. Since this is a privilege violation, default disable SMM visibility where possible. Administrators wanting to observe SMM cycles can easily change this using the sysfs attribute while regular users don't have access to this file. [0] https://rr-project.org/ [1] See the Intel white paper "Trustworthy SMM on the Intel vPro Platform" at https://bugzilla.kernel.org/attachment.cgi?id=300300, particularly the end of page 5. Reported-by: Kyle Huey Suggested-by: Andrew Cooper Signed-off-by: Peter Zijlstra (Intel) Cc: stable@kernel.org Link: https://lkml.kernel.org/r/YfKChjX61OW4CkYm@hirez.programming.kicks-ass.net --- arch/x86/events/intel/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c91434056c29..a3c7ca876aeb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4703,6 +4703,19 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_read = intel_pmu_lbr_read_64, .lbr_save = intel_pmu_lbr_save, .lbr_restore = intel_pmu_lbr_restore, + + /* + * SMM has access to all 4 rings and while traditionally SMM code only + * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM. + * + * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction + * between SMM or not, this results in what should be pure userspace + * counters including SMM data. + * + * This is a clear privilege issue, therefore globally disable + * counting SMM by default. + */ + .attr_freeze_on_smi = 1, }; static __init void intel_clovertown_quirk(void) From 3c25fc97f5590060464cabfa25710970ecddbc96 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:05 +0100 Subject: [PATCH 0963/1295] perf: Copy perf_event_attr::sig_data on modification The intent has always been that perf_event_attr::sig_data should also be modifiable along with PERF_EVENT_IOC_MODIFY_ATTRIBUTES, because it is observable by user space if SIGTRAP on events is requested. Currently only PERF_TYPE_BREAKPOINT is modifiable, and explicitly copies relevant breakpoint-related attributes in hw_breakpoint_copy_attr(). This misses copying perf_event_attr::sig_data. Since sig_data is not specific to PERF_TYPE_BREAKPOINT, introduce a helper to copy generic event-type-independent attributes on modification. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-1-elver@google.com --- kernel/events/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d01..57c7197838db 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3238,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3260,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; From 95d29fa104523b1756323f7003294b1711c27808 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:06 +0100 Subject: [PATCH 0964/1295] selftests/perf_events: Test modification of perf_event_attr::sig_data Test that PERF_EVENT_IOC_MODIFY_ATTRIBUTES correctly modifies perf_event_attr::sig_data as well. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-2-elver@google.com --- .../selftests/perf_events/sigtrap_threads.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 8e83cf91513a..6d849dc2bee0 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -44,9 +44,10 @@ static struct { } ctx; /* Unique value to check si_perf_data is correctly set from perf_event_attr::sig_data. */ -#define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) +#define TEST_SIG_DATA(addr, id) (~(unsigned long)(addr) + id) -static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) +static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr, + unsigned long id) { struct perf_event_attr attr = { .type = PERF_TYPE_BREAKPOINT, @@ -60,7 +61,7 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. */ .remove_on_exec = 1, /* Required by sigtrap. */ .sigtrap = 1, /* Request synchronous SIGTRAP on event. */ - .sig_data = TEST_SIG_DATA(addr), + .sig_data = TEST_SIG_DATA(addr, id), }; return attr; } @@ -110,7 +111,7 @@ FIXTURE(sigtrap_threads) FIXTURE_SETUP(sigtrap_threads) { - struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on); + struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on, 0); struct sigaction action = {}; int i; @@ -165,7 +166,7 @@ TEST_F(sigtrap_threads, enable_event) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -175,7 +176,7 @@ TEST_F(sigtrap_threads, enable_event) /* Test that modification propagates to all inherited events. */ TEST_F(sigtrap_threads, modify_and_enable_event) { - struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on); + struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on, 42); EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr), 0); run_test_threads(_metadata, self); @@ -184,7 +185,7 @@ TEST_F(sigtrap_threads, modify_and_enable_event) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 42)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -204,7 +205,7 @@ TEST_F(sigtrap_threads, signal_stress) EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0)); } TEST_HARNESS_MAIN From ddecd22878601a606d160680fa85802b75d92eb6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:07 +0100 Subject: [PATCH 0965/1295] perf: uapi: Document perf_event_attr::sig_data truncation on 32 bit architectures Due to the alignment requirements of siginfo_t, as described in 3ddb3fd8cdb0 ("signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures"), siginfo_t::si_perf_data is limited to an unsigned long. However, perf_event_attr::sig_data is an u64, to avoid having to deal with compat conversions. Due to being an u64, it may not immediately be clear to users that sig_data is truncated on 32 bit architectures. Add a comment to explicitly point this out, and hopefully help some users save time by not having to deduce themselves what's happening. Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-3-elver@google.com --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db..82858b697c05 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -465,6 +465,8 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via * siginfo_t::si_perf_data, e.g. to permit user to identify the event. + * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. */ __u64 sig_data; }; From 1d9093457b243061a9bba23543c38726e864a643 Mon Sep 17 00:00:00 2001 From: Tristan Hume Date: Thu, 27 Jan 2022 17:08:06 -0500 Subject: [PATCH 0966/1295] perf/x86/intel/pt: Fix crash with stop filters in single-range mode Add a check for !buf->single before calling pt_buffer_region_size in a place where a missing check can cause a kernel crash. Fixes a bug introduced by commit 670638477aed ("perf/x86/intel/pt: Opportunistically use single range output mode"), which added a support for PT single-range output mode. Since that commit if a PT stop filter range is hit while tracing, the kernel will crash because of a null pointer dereference in pt_handle_status due to calling pt_buffer_region_size without a ToPA configured. The commit which introduced single-range mode guarded almost all uses of the ToPA buffer variables with checks of the buf->single variable, but missed the case where tracing was stopped by the PT hardware, which happens when execution hits a configured stop filter. Tested that hitting a stop filter while PT recording successfully records a trace with this patch but crashes without this patch. Fixes: 670638477aed ("perf/x86/intel/pt: Opportunistically use single range output mode") Signed-off-by: Tristan Hume Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Cc: stable@kernel.org Link: https://lkml.kernel.org/r/20220127220806.73664-1-tristan@thume.ca --- arch/x86/events/intel/pt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 7f406c14715f..2d33bba9a144 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -897,8 +897,9 @@ static void pt_handle_status(struct pt *pt) * means we are already losing data; need to let the decoder * know. */ - if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == pt_buffer_region_size(buf)) { + if (!buf->single && + (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || + buf->output_off == pt_buffer_region_size(buf))) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; From 6455317e4d0d8395e8e4a2fd1ec8d6502267dd02 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:25 +0000 Subject: [PATCH 0967/1295] kvm/riscv: rework guest entry logic In kvm_arch_vcpu_ioctl_run() we enter an RCU extended quiescent state (EQS) by calling guest_enter_irqoff(), and unmask IRQs prior to exiting the EQS by calling guest_exit(). As the IRQ entry code will not wake RCU in this case, we may run the core IRQ code and IRQ handler without RCU watching, leading to various potential problems. Additionally, we do not inform lockdep or tracing that interrupts will be enabled during guest execution, which caan lead to misleading traces and warnings that interrupts have been enabled for overly-long periods. This patch fixes these issues by using the new timing and context entry/exit helpers to ensure that interrupts are handled during guest vtime but with RCU watching, with a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); Since instrumentation may make use of RCU, we must also ensure that no instrumented code is run during the EQS. I've split out the critical section into a new kvm_riscv_enter_exit_vcpu() helper which is marked noinstr. Fixes: 99cdc6c18c2d815e ("RISC-V: Add initial skeletal KVM support") Signed-off-by: Mark Rutland Cc: Albert Ou Cc: Anup Patel Cc: Atish Patra Cc: Frederic Weisbecker Cc: Palmer Dabbelt Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Paul Walmsley Tested-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 44 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 0c5239e05721..f64f62057378 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -699,6 +699,20 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) csr_write(CSR_HVIP, csr->hvip); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + guest_state_enter_irqoff(); + __kvm_riscv_switch_to(&vcpu->arch); + guest_state_exit_irqoff(); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int ret; @@ -790,9 +804,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - __kvm_riscv_switch_to(&vcpu->arch); + kvm_riscv_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -812,25 +826,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_sync_interrupts(vcpu); /* - * We may have taken a host interrupt in VS/VU-mode (i.e. - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in HS-mode with interrupts disabled - * so enabling the interrupts now will have the effect - * of taking the interrupt again, in HS-mode this time. + * There's no barrier which ensures that pending interrupts are + * recognised, so we just hope that the CPU takes any pending + * interrupts between the enable and disable. */ local_irq_enable(); + local_irq_disable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest - * we account that tick as being spent in the guest. We - * enable preemption after calling guest_exit() so that if - * we get preempted we make sure ticks after that is not - * counted as guest time. - */ - guest_exit(); + guest_timing_exit_irqoff(); + + local_irq_enable(); preempt_enable(); From de1d7b6a51dab546160d252e47baa54adf104d4a Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Mon, 31 Jan 2022 16:33:07 +0530 Subject: [PATCH 0968/1295] RISC-V: KVM: make CY, TM, and IR counters accessible in VU mode Those applications that run in VU mode and access the time CSR cause a virtual instruction trap as Guest kernel currently does not initialize the scounteren CSR. To fix this, we should make CY, TM, and IR counters accessibile by default in VU mode (similar to OpenSBI). Fixes: a33c72faf2d73 ("RISC-V: KVM: Implement VCPU create, init and destroy functions") Cc: stable@vger.kernel.org Signed-off-by: Mayuresh Chitale Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f64f62057378..624166004e36 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -90,6 +90,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *cntx; + struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; @@ -106,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) cntx->hstatus |= HSTATUS_SPVP; cntx->hstatus |= HSTATUS_SPV; + /* By default, make CY, TM, and IR counters accessible in VU mode */ + reset_csr->scounteren = 0x7; + /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); From 403271548a840dd4f884088d6333e09f899be5ff Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 31 Jan 2022 22:12:32 +0530 Subject: [PATCH 0969/1295] RISC-V: KVM: Fix SBI implementation version The SBI implementation version returned by KVM RISC-V should be the Host Linux version code. Fixes: c62a76859723 ("RISC-V: KVM: Add SBI v0.2 base extension") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 4ecf377f483b..48f431091cdb 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, *out_val = KVM_SBI_IMPID; break; case SBI_EXT_BASE_GET_IMP_VERSION: - *out_val = 0; + *out_val = LINUX_VERSION_CODE; break; case SBI_EXT_BASE_PROBE_EXT: if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START && From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:29 +0100 Subject: [PATCH 0970/1295] Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from TODO list)" This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578. Revert the second (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D hardware acceleration (bitblt, fillrect) in the drivers isn't used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because beside the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means, there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by factor 10 and more and thus this change leads to console responsiveness way worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches. Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-2-deller@gmx.de --- Documentation/gpu/todo.rst | 13 +- drivers/video/fbdev/core/bitblit.c | 16 + drivers/video/fbdev/core/fbcon.c | 509 +++++++++++++++++++++++- drivers/video/fbdev/core/fbcon.h | 59 +++ drivers/video/fbdev/core/fbcon_ccw.c | 28 +- drivers/video/fbdev/core/fbcon_cw.c | 28 +- drivers/video/fbdev/core/fbcon_rotate.h | 9 + drivers/video/fbdev/core/fbcon_ud.c | 37 +- drivers/video/fbdev/core/tileblit.c | 16 + drivers/video/fbdev/skeletonfb.c | 12 +- include/linux/fb.h | 2 +- 11 files changed, 678 insertions(+), 51 deletions(-) diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index da138dd39883..29506815d24a 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -303,19 +303,16 @@ Level: Advanced Garbage collect fbdev scrolling acceleration -------------------------------------------- -Scroll acceleration has been disabled in fbcon. Now it works as the old -SCROLL_REDRAW mode. A ton of code was removed in fbcon.c and the hook bmove was -removed from fbcon_ops. -Remaining tasks: +Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = +SCROLL_REDRAW. There's a ton of code this will allow us to remove: -- a bunch of the hooks in fbcon_ops could be removed or simplified by calling +- lots of code in fbcon.c + +- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called directly instead of the function table (with a switch on p->rotate) - fb_copyarea is unused after this, and can be deleted from all drivers -- after that, fb_copyarea can be deleted from fb_ops in include/linux/fb.h as - well as cfb_copyarea - Note that not all acceleration code can be deleted, since clearing and cursor support is still accelerated, which might be good candidates for further deletion projects. diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 01fae2c96965..f98e8f298bc1 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -43,6 +43,21 @@ static void update_attr(u8 *dst, u8 *src, int attribute, } } +static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_copyarea area; + + area.sx = sx * vc->vc_font.width; + area.sy = sy * vc->vc_font.height; + area.dx = dx * vc->vc_font.width; + area.dy = dy * vc->vc_font.height; + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -378,6 +393,7 @@ static int bit_update_start(struct fb_info *info) void fbcon_set_bitops(struct fbcon_ops *ops) { + ops->bmove = bit_bmove; ops->clear = bit_clear; ops->putcs = bit_putcs; ops->clear_margins = bit_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 99ecd9a6d844..fc34caddf9cf 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -173,6 +173,8 @@ static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); static void fbcon_cursor(struct vc_data *vc, int mode); +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width); static int fbcon_switch(struct vc_data *vc); static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); @@ -180,8 +182,16 @@ static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); /* * Internal routines */ +static __inline__ void ywrap_up(struct vc_data *vc, int count); +static __inline__ void ywrap_down(struct vc_data *vc, int count); +static __inline__ void ypan_up(struct vc_data *vc, int count); +static __inline__ void ypan_down(struct vc_data *vc, int count); +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break); static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, int unit); +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy); static void fbcon_modechanged(struct fb_info *info); static void fbcon_set_all_vcs(struct fb_info *info); static void fbcon_start(void); @@ -1125,6 +1135,14 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; + /* + * No more hw acceleration for fbcon. + * + * FIXME: Garbage collect all the now dead code after sufficient time + * has passed. + */ + p->scrollmode = SCROLL_REDRAW; + /* * ++guenther: console.c:vc_allocate() relies on initializing * vc_{cols,rows}, but we must not set those if we are only @@ -1211,13 +1229,14 @@ finished: * This system is now divided into two levels because of complications * caused by hardware scrolling. Top level functions: * - * fbcon_clear(), fbcon_putc(), fbcon_clear_margins() + * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() * * handles y values in range [0, scr_height-1] that correspond to real * screen positions. y_wrap shift means that first line of bitmap may be * anywhere on this display. These functions convert lineoffsets to * bitmap offsets and deal with the wrap-around case by splitting blits. * + * fbcon_bmove_physical_8() -- These functions fast implementations * fbcon_clear_physical_8() -- of original fbcon_XXX fns. * fbcon_putc_physical_8() -- (font width != 8) may be added later * @@ -1390,6 +1409,224 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, } } +static __inline__ void ywrap_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + if (p->yscroll >= p->vrows) /* Deal with wrap */ + p->yscroll -= p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ywrap_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + if (p->yscroll < 0) /* Deal with wrap */ + p->yscroll += p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll += count; + if (p->yscroll > p->vrows - vc->vc_rows) { + ops->bmove(vc, info, p->vrows - vc->vc_rows, + 0, 0, 0, vc->vc_rows, vc->vc_cols); + p->yscroll -= p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + + if (p->yscroll > p->vrows - vc->vc_rows) { + p->yscroll -= p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll -= count; + if (p->yscroll < 0) { + ops->bmove(vc, info, 0, 0, p->vrows - vc->vc_rows, + 0, vc->vc_rows, vc->vc_cols); + p->yscroll += p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + + if (p->yscroll < 0) { + p->yscroll += p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy) +{ + unsigned short *s = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + unsigned short attr = 1; + + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + fbcon_putcs(vc, start, s - start, + dy, x); + x += s - start; + start = s; + } + } + console_conditional_schedule(); + s++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, dy, x); + console_conditional_schedule(); + dy++; + } +} + +static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, + struct fbcon_display *p, int line, int count, int ycount) +{ + int offset = ycount * vc->vc_cols; + unsigned short *d = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + unsigned short *s = d + offset; + struct fbcon_ops *ops = info->fbcon_par; + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + + do { + c = scr_readw(s); + + if (c == scr_readw(d)) { + if (s > start) { + ops->bmove(vc, info, line + ycount, x, + line, x, 1, s-start); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + + scr_writew(c, d); + console_conditional_schedule(); + s++; + d++; + } while (s < le); + if (s > start) + ops->bmove(vc, info, line + ycount, x, line, x, 1, + s-start); + console_conditional_schedule(); + if (ycount > 0) + line++; + else { + line--; + /* NOTE: We subtract two lines from these pointers */ + s -= vc->vc_size_row; + d -= vc->vc_size_row; + } + } +} + static void fbcon_redraw(struct vc_data *vc, struct fbcon_display *p, int line, int count, int offset) { @@ -1450,6 +1687,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_display *p = &fb_display[vc->vc_num]; + int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; if (fbcon_is_inactive(vc, info)) return true; @@ -1466,32 +1704,249 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, case SM_UP: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, t, b - t - count, - count * vc->vc_cols); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - (b - count)), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_up; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, t, b - t - count, + count); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ywrap_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_redraw_move(vc, p, 0, t, count); + ypan_up_redraw(vc, t, count); + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, + vc->vc_rows - b, b); + } else + fbcon_redraw_move(vc, p, t + count, b - t - count, t); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ypan_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_up: + fbcon_redraw(vc, p, t, b - t - count, + count * vc->vc_cols); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } + break; case SM_DOWN: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, b - 1, b - t - count, - -count * vc->vc_cols); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - t), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_down; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, + -count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ywrap_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ypan_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, vc->vc_rows - b, + b - count); + ypan_down_redraw(vc, t, count); + if (t > 0) + fbcon_redraw_move(vc, p, count, t, 0); + } else + fbcon_redraw_move(vc, p, t, b - t - count, t + count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_down: + fbcon_redraw(vc, p, b - 1, b - t - count, + -count * vc->vc_cols); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } } return false; } + +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!width || !height) + return; + + /* Split blits that cross physical y_wrap case. + * Pathological case involves 4 blits, better to use recursive + * code rather than unrolled case + * + * Recursive invocations don't need to erase the cursor over and + * over again, so we use fbcon_bmove_rec() + */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, + p->vrows - p->yscroll); +} + +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + u_int b; + + if (sy < y_break && sy + height > y_break) { + b = y_break - sy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + + if (dy < y_break && dy + height > y_break) { + b = y_break - dy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, + height, width); +} + static void updatescrollmode(struct fbcon_display *p, struct fb_info *info, struct vc_data *vc) @@ -1664,7 +2119,21 @@ static int fbcon_switch(struct vc_data *vc) updatescrollmode(p, info, vc); - scrollback_phys_max = 0; + switch (p->scrollmode) { + case SCROLL_WRAP_MOVE: + scrollback_phys_max = p->vrows - vc->vc_rows; + break; + case SCROLL_PAN_MOVE: + case SCROLL_PAN_REDRAW: + scrollback_phys_max = p->vrows - 2 * vc->vc_rows; + if (scrollback_phys_max < 0) + scrollback_phys_max = 0; + break; + default: + scrollback_phys_max = 0; + break; + } + scrollback_max = 0; scrollback_current = 0; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index a00603b4451a..5246d0f2574b 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -29,6 +29,7 @@ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ + u_short scrollmode; /* Scroll Method */ u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ @@ -51,6 +52,8 @@ struct fbcon_display { }; struct fbcon_ops { + void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width); void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width); void (*putcs)(struct vc_data *vc, struct fb_info *info, @@ -149,6 +152,62 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) #define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) + /* + * Scroll Method + */ + +/* There are several methods fbcon can use to move text around the screen: + * + * Operation Pan Wrap + *--------------------------------------------- + * SCROLL_MOVE copyarea No No + * SCROLL_PAN_MOVE copyarea Yes No + * SCROLL_WRAP_MOVE copyarea No Yes + * SCROLL_REDRAW imageblit No No + * SCROLL_PAN_REDRAW imageblit Yes No + * SCROLL_WRAP_REDRAW imageblit No Yes + * + * (SCROLL_WRAP_REDRAW is not implemented yet) + * + * In general, fbcon will choose the best scrolling + * method based on the rule below: + * + * Pan/Wrap > accel imageblit > accel copyarea > + * soft imageblit > (soft copyarea) + * + * Exception to the rule: Pan + accel copyarea is + * preferred over Pan + accel imageblit. + * + * The above is typical for PCI/AGP cards. Unless + * overridden, fbcon will never use soft copyarea. + * + * If you need to override the above rule, set the + * appropriate flags in fb_info->flags. For example, + * to prefer copyarea over imageblit, set + * FBINFO_READS_FAST. + * + * Other notes: + * + use the hardware engine to move the text + * (hw-accelerated copyarea() and fillrect()) + * + use hardware-supported panning on a large virtual screen + * + amifb can not only pan, but also wrap the display by N lines + * (i.e. visible line i = physical line (i+N) % yres). + * + read what's already rendered on the screen and + * write it in a different place (this is cfb_copyarea()) + * + re-render the text to the screen + * + * Whether to use wrapping or panning can only be figured out at + * runtime (when we know whether our font height is a multiple + * of the pan/wrap step) + * + */ + +#define SCROLL_MOVE 0x001 +#define SCROLL_PAN_MOVE 0x002 +#define SCROLL_WRAP_MOVE 0x003 +#define SCROLL_REDRAW 0x004 +#define SCROLL_PAN_REDRAW 0x005 + #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index ffa78936eaab..9cd2c4b05c32 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -59,12 +59,31 @@ static void ccw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + area.sx = sy * vc->vc_font.height; + area.sy = vyres - ((sx + width) * vc->vc_font.width); + area.dx = dy * vc->vc_font.height; + area.dy = vyres - ((dx + width) * vc->vc_font.width); + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = sy * vc->vc_font.height; @@ -121,7 +140,7 @@ static void ccw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -210,7 +229,7 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -368,7 +387,7 @@ static int ccw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; u32 yoffset; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); int err; yoffset = (vyres - info->var.yres) - ops->var.xoffset; @@ -383,6 +402,7 @@ static int ccw_update_start(struct fb_info *info) void fbcon_rotate_ccw(struct fbcon_ops *ops) { + ops->bmove = ccw_bmove; ops->clear = ccw_clear; ops->putcs = ccw_putcs; ops->clear_margins = ccw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 92e5b7fb51ee..88d89fad3f05 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -44,12 +44,31 @@ static void cw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sx = vxres - ((sy + height) * vc->vc_font.height); + area.sy = sx * vc->vc_font.width; + area.dx = vxres - ((dy + height) * vc->vc_font.height); + area.dy = dx * vc->vc_font.width; + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = vxres - ((sy + height) * vc->vc_font.height); @@ -106,7 +125,7 @@ static void cw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -193,7 +212,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -350,7 +369,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, static int cw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); u32 xoffset; int err; @@ -366,6 +385,7 @@ static int cw_update_start(struct fb_info *info) void fbcon_rotate_cw(struct fbcon_ops *ops) { + ops->bmove = cw_bmove; ops->clear = cw_clear; ops->putcs = cw_putcs; ops->clear_margins = cw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h index b528b2e54283..e233444cda66 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.h +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -11,6 +11,15 @@ #ifndef _FBCON_ROTATE_H #define _FBCON_ROTATE_H +#define GETVYRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (i)->var.yres : (i)->var.yres_virtual; }) + +#define GETVXRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ + (i)->var.xres : (i)->var.xres_virtual; }) + + static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) { u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 09619bd8e021..8d5e66b1bdfb 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -44,13 +44,33 @@ static void ud_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sy = vyres - ((sy + height) * vc->vc_font.height); + area.sx = vxres - ((sx + width) * vc->vc_font.width); + area.dy = vyres - ((dy + height) * vc->vc_font.height); + area.dx = vxres - ((dx + width) * vc->vc_font.width); + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dy = vyres - ((sy + height) * vc->vc_font.height); @@ -142,8 +162,8 @@ static void ud_putcs(struct vc_data *vc, struct fb_info *info, u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -239,8 +259,8 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -390,8 +410,8 @@ static int ud_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int xoffset, yoffset; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); int err; xoffset = vxres - info->var.xres - ops->var.xoffset; @@ -409,6 +429,7 @@ static int ud_update_start(struct fb_info *info) void fbcon_rotate_ud(struct fbcon_ops *ops) { + ops->bmove = ud_bmove; ops->clear = ud_clear; ops->putcs = ud_putcs; ops->clear_margins = ud_clear_margins; diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 72af95053bcb..2768eff247ba 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -16,6 +16,21 @@ #include #include "fbcon.h" +static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_tilearea area; + + area.sx = sx; + area.sy = sy; + area.dx = dx; + area.dy = dy; + area.height = height; + area.width = width; + + info->tileops->fb_tilecopy(info, &area); +} + static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -118,6 +133,7 @@ void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info) struct fb_tilemap map; struct fbcon_ops *ops = info->fbcon_par; + ops->bmove = tile_bmove; ops->clear = tile_clear; ops->putcs = tile_putcs; ops->clear_margins = tile_clear_margins; diff --git a/drivers/video/fbdev/skeletonfb.c b/drivers/video/fbdev/skeletonfb.c index 0fe922f726e9..bcacfb6934fa 100644 --- a/drivers/video/fbdev/skeletonfb.c +++ b/drivers/video/fbdev/skeletonfb.c @@ -505,15 +505,15 @@ void xxxfb_fillrect(struct fb_info *p, const struct fb_fillrect *region) } /** - * xxxfb_copyarea - OBSOLETE function. + * xxxfb_copyarea - REQUIRED function. Can use generic routines if + * non acclerated hardware and packed pixel based. * Copies one area of the screen to another area. - * Will be deleted in a future version * * @info: frame buffer structure that represents a single frame buffer * @area: Structure providing the data to copy the framebuffer contents * from one region to another. * - * This drawing operation copied a rectangular area from one area of the + * This drawing operation copies a rectangular area from one area of the * screen to another area. */ void xxxfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) @@ -645,9 +645,9 @@ static const struct fb_ops xxxfb_ops = { .fb_setcolreg = xxxfb_setcolreg, .fb_blank = xxxfb_blank, .fb_pan_display = xxxfb_pan_display, - .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ - .fb_copyarea = xxxfb_copyarea, /* Obsolete */ - .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ + .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ + .fb_copyarea = xxxfb_copyarea, /* Needed !!! */ + .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ .fb_cursor = xxxfb_cursor, /* Optional !!! */ .fb_sync = xxxfb_sync, .fb_ioctl = xxxfb_ioctl, diff --git a/include/linux/fb.h b/include/linux/fb.h index 3da95842b207..02f362c661c8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -262,7 +262,7 @@ struct fb_ops { /* Draws a rectangle */ void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); - /* Copy data from area to another. Obsolete. */ + /* Copy data from area to another */ void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); /* Draws a image to the display */ void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); From 87ab9f6b7417349aa197a6c7098d4fdd4beebb74 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:30 +0100 Subject: [PATCH 0971/1295] Revert "fbcon: Disable accelerated scrolling" This reverts commit 39aead8373b3c20bb5965c024dfb51a94e526151. Revert the first (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D hardware acceleration (bitblt, fillrect) in the drivers isn't used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because beside the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means, there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by factor 10 and more and thus this change leads to console responsiveness way worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches. Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-3-deller@gmx.de --- Documentation/gpu/todo.rst | 21 --------------- drivers/video/fbdev/core/fbcon.c | 45 ++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index 29506815d24a..a1212b5b3026 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -300,27 +300,6 @@ Contact: Daniel Vetter, Noralf Tronnes Level: Advanced -Garbage collect fbdev scrolling acceleration --------------------------------------------- - -Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = -SCROLL_REDRAW. There's a ton of code this will allow us to remove: - -- lots of code in fbcon.c - -- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called - directly instead of the function table (with a switch on p->rotate) - -- fb_copyarea is unused after this, and can be deleted from all drivers - -Note that not all acceleration code can be deleted, since clearing and cursor -support is still accelerated, which might be good candidates for further -deletion projects. - -Contact: Daniel Vetter - -Level: Intermediate - idr_init_base() --------------- diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fc34caddf9cf..0cc2a36b674a 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1025,7 +1025,7 @@ static void fbcon_init(struct vc_data *vc, int init) struct vc_data *svc = *default_mode; struct fbcon_display *t, *p = &fb_display[vc->vc_num]; int logo = 1, new_rows, new_cols, rows, cols; - int ret; + int cap, ret; if (WARN_ON(info_idx == -1)) return; @@ -1034,6 +1034,7 @@ static void fbcon_init(struct vc_data *vc, int init) con2fb_map[vc->vc_num] = info_idx; info = registered_fb[con2fb_map[vc->vc_num]]; + cap = info->flags; if (logo_shown < 0 && console_loglevel <= CONSOLE_LOGLEVEL_QUIET) logo_shown = FBCON_LOGO_DONTSHOW; @@ -1135,13 +1136,11 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; - /* - * No more hw acceleration for fbcon. - * - * FIXME: Garbage collect all the now dead code after sufficient time - * has passed. - */ - p->scrollmode = SCROLL_REDRAW; + if ((cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED)) + p->scrollmode = SCROLL_MOVE; + else /* default to something safe */ + p->scrollmode = SCROLL_REDRAW; /* * ++guenther: console.c:vc_allocate() relies on initializing @@ -1953,15 +1952,45 @@ static void updatescrollmode(struct fbcon_display *p, { struct fbcon_ops *ops = info->fbcon_par; int fh = vc->vc_font.height; + int cap = info->flags; + u16 t = 0; + int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, + info->fix.xpanstep); + int ywrap = FBCON_SWAP(ops->rotate, info->fix.ywrapstep, t); int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, info->var.xres_virtual); + int good_pan = (cap & FBINFO_HWACCEL_YPAN) && + divides(ypan, vc->vc_font.height) && vyres > yres; + int good_wrap = (cap & FBINFO_HWACCEL_YWRAP) && + divides(ywrap, vc->vc_font.height) && + divides(vc->vc_font.height, vyres) && + divides(vc->vc_font.height, yres); + int reading_fast = cap & FBINFO_READS_FAST; + int fast_copyarea = (cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED); + int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && + !(cap & FBINFO_HWACCEL_DISABLED); p->vrows = vyres/fh; if (yres > (fh * (vc->vc_rows + 1))) p->vrows -= (yres - (fh * vc->vc_rows)) / fh; if ((yres % fh) && (vyres % fh < yres % fh)) p->vrows--; + + if (good_wrap || good_pan) { + if (reading_fast || fast_copyarea) + p->scrollmode = good_wrap ? + SCROLL_WRAP_MOVE : SCROLL_PAN_MOVE; + else + p->scrollmode = good_wrap ? SCROLL_REDRAW : + SCROLL_PAN_REDRAW; + } else { + if (reading_fast || (fast_copyarea && !fast_imageblit)) + p->scrollmode = SCROLL_MOVE; + else + p->scrollmode = SCROLL_REDRAW; + } } #define PITCH(w) (((w) + 7) >> 3) From a3f781a9d6114c1d1e01defb7aa234dec45d2a5f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:31 +0100 Subject: [PATCH 0972/1295] fbcon: Add option to enable legacy hardware acceleration Add a config option CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION to enable bitblt and fillrect hardware acceleration in the framebuffer console. If disabled, such acceleration will not be used, even if it is supported by the graphics hardware driver. If you plan to use DRM as your main graphics output system, you should disable this option since it will prevent compiling in code which isn't used later on when DRM takes over. For all other configurations, e.g. if none of your graphic cards support DRM (yet), DRM isn't available for your architecture, or you can't be sure that the graphic card in the target system will support DRM, you most likely want to enable this option. In the non-accelerated case (e.g. when DRM is used), the inlined fb_scrollmode() function is hardcoded to return SCROLL_REDRAW and as such the compiler is able to optimize much unneccesary code away. In this v3 patch version I additionally changed the GETVYRES() and GETVXRES() macros to take a pointer to the fbcon_display struct. This fixes the build when console rotation is enabled and helps the compiler again to optimize out code. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-4-deller@gmx.de --- drivers/video/console/Kconfig | 20 +++++++++++++ drivers/video/fbdev/core/fbcon.c | 39 ++++++++++++++++++------- drivers/video/fbdev/core/fbcon.h | 15 +++++++++- drivers/video/fbdev/core/fbcon_ccw.c | 10 +++---- drivers/video/fbdev/core/fbcon_cw.c | 10 +++---- drivers/video/fbdev/core/fbcon_rotate.h | 4 +-- drivers/video/fbdev/core/fbcon_ud.c | 20 ++++++------- 7 files changed, 84 insertions(+), 34 deletions(-) diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 840d9813b0bc..fcc46380e7c9 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -78,6 +78,26 @@ config FRAMEBUFFER_CONSOLE help Low-level framebuffer-based console driver. +config FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + bool "Enable legacy fbcon hardware acceleration code" + depends on FRAMEBUFFER_CONSOLE + default y if PARISC + default n + help + This option enables the fbcon (framebuffer text-based) hardware + acceleration for graphics drivers which were written for the fbdev + graphics interface. + + On modern machines, on mainstream machines (like x86-64) or when + using a modern Linux distribution those fbdev drivers usually aren't used. + So enabling this option wouldn't have any effect, which is why you want + to disable this option on such newer machines. + + If you compile this kernel for older machines which still require the + fbdev drivers, you may want to say Y. + + If unsure, select n. + config FRAMEBUFFER_CONSOLE_DETECT_PRIMARY bool "Map the console to the primary display device" depends on FRAMEBUFFER_CONSOLE diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 0cc2a36b674a..f36829eeb5a9 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1136,11 +1136,13 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION if ((cap & FBINFO_HWACCEL_COPYAREA) && !(cap & FBINFO_HWACCEL_DISABLED)) p->scrollmode = SCROLL_MOVE; else /* default to something safe */ p->scrollmode = SCROLL_REDRAW; +#endif /* * ++guenther: console.c:vc_allocate() relies on initializing @@ -1705,7 +1707,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, count = vc->vc_rows; if (logo_shown >= 0) goto redraw_up; - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_MOVE: fbcon_redraw_blit(vc, info, p, t, b - t - count, count); @@ -1795,7 +1797,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, count = vc->vc_rows; if (logo_shown >= 0) goto redraw_down; - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_MOVE: fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, -count); @@ -1946,12 +1948,12 @@ static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, height, width); } -static void updatescrollmode(struct fbcon_display *p, +static void updatescrollmode_accel(struct fbcon_display *p, struct fb_info *info, struct vc_data *vc) { +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION struct fbcon_ops *ops = info->fbcon_par; - int fh = vc->vc_font.height; int cap = info->flags; u16 t = 0; int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, @@ -1972,12 +1974,6 @@ static void updatescrollmode(struct fbcon_display *p, int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && !(cap & FBINFO_HWACCEL_DISABLED); - p->vrows = vyres/fh; - if (yres > (fh * (vc->vc_rows + 1))) - p->vrows -= (yres - (fh * vc->vc_rows)) / fh; - if ((yres % fh) && (vyres % fh < yres % fh)) - p->vrows--; - if (good_wrap || good_pan) { if (reading_fast || fast_copyarea) p->scrollmode = good_wrap ? @@ -1991,6 +1987,27 @@ static void updatescrollmode(struct fbcon_display *p, else p->scrollmode = SCROLL_REDRAW; } +#endif +} + +static void updatescrollmode(struct fbcon_display *p, + struct fb_info *info, + struct vc_data *vc) +{ + struct fbcon_ops *ops = info->fbcon_par; + int fh = vc->vc_font.height; + int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, + info->var.xres_virtual); + + p->vrows = vyres/fh; + if (yres > (fh * (vc->vc_rows + 1))) + p->vrows -= (yres - (fh * vc->vc_rows)) / fh; + if ((yres % fh) && (vyres % fh < yres % fh)) + p->vrows--; + + /* update scrollmode in case hardware acceleration is used */ + updatescrollmode_accel(p, info, vc); } #define PITCH(w) (((w) + 7) >> 3) @@ -2148,7 +2165,7 @@ static int fbcon_switch(struct vc_data *vc) updatescrollmode(p, info, vc); - switch (p->scrollmode) { + switch (fb_scrollmode(p)) { case SCROLL_WRAP_MOVE: scrollback_phys_max = p->vrows - vc->vc_rows; break; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 5246d0f2574b..969d41ecede5 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -29,7 +29,9 @@ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ - u_short scrollmode; /* Scroll Method */ +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ +#endif u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ @@ -208,6 +210,17 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define SCROLL_REDRAW 0x004 #define SCROLL_PAN_REDRAW 0x005 +static inline u_short fb_scrollmode(struct fbcon_display *fb) +{ +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION + return fb->scrollmode; +#else + /* hardcoded to SCROLL_REDRAW if acceleration was disabled. */ + return SCROLL_REDRAW; +#endif +} + + #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index 9cd2c4b05c32..2789ace79634 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -65,7 +65,7 @@ static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); area.sx = sy * vc->vc_font.height; area.sy = vyres - ((sx + width) * vc->vc_font.width); @@ -83,7 +83,7 @@ static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = sy * vc->vc_font.height; @@ -140,7 +140,7 @@ static void ccw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); if (!ops->fontbuffer) return; @@ -229,7 +229,7 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); if (!ops->fontbuffer) return; @@ -387,7 +387,7 @@ static int ccw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; u32 yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); int err; yoffset = (vyres - info->var.yres) - ops->var.xoffset; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 88d89fad3f05..86a254c1b2b7 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -50,7 +50,7 @@ static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); area.sx = vxres - ((sy + height) * vc->vc_font.height); area.sy = sx * vc->vc_font.width; @@ -68,7 +68,7 @@ static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = vxres - ((sy + height) * vc->vc_font.height); @@ -125,7 +125,7 @@ static void cw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -212,7 +212,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -369,7 +369,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, static int cw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p, info); u32 xoffset; int err; diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h index e233444cda66..01cbe303b8a2 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.h +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -12,11 +12,11 @@ #define _FBCON_ROTATE_H #define GETVYRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (fb_scrollmode(s) == SCROLL_REDRAW || fb_scrollmode(s) == SCROLL_MOVE) ? \ (i)->var.yres : (i)->var.yres_virtual; }) #define GETVXRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ + (fb_scrollmode(s) == SCROLL_REDRAW || fb_scrollmode(s) == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ (i)->var.xres : (i)->var.xres_virtual; }) diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 8d5e66b1bdfb..23bc045769d0 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -50,8 +50,8 @@ static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, { struct fbcon_ops *ops = info->fbcon_par; struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); area.sy = vyres - ((sy + height) * vc->vc_font.height); area.sx = vxres - ((sx + width) * vc->vc_font.width); @@ -69,8 +69,8 @@ static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dy = vyres - ((sy + height) * vc->vc_font.height); @@ -162,8 +162,8 @@ static void ud_putcs(struct vc_data *vc, struct fb_info *info, u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -259,8 +259,8 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); if (!ops->fontbuffer) return; @@ -410,8 +410,8 @@ static int ud_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int xoffset, yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 vyres = GETVYRES(ops->p, info); + u32 vxres = GETVXRES(ops->p, info); int err; xoffset = vxres - info->var.xres - ops->var.xoffset; From 3e1f941dd9f33776b3df4e30f741fe445ff773f3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 1 Feb 2022 11:04:20 +0100 Subject: [PATCH 0973/1295] block: fix DIO handling regressions in blkdev_read_iter() Commit ceaa762527f4 ("block: move direct_IO into our own read_iter handler") introduced several regressions for bdev DIO: 1. read spanning EOF always returns 0 instead of the number of bytes read. This is because "count" is assigned early and isn't updated when the iterator is truncated: $ lsblk -o name,size /dev/vdb NAME SIZE vdb 1G $ xfs_io -d -c 'pread -b 4M 1021M 4M' /dev/vdb read 0/4194304 bytes at offset 1070596096 0.000000 bytes, 0 ops; 0.0007 sec (0.000000 bytes/sec and 0.0000 ops/sec) instead of $ xfs_io -d -c 'pread -b 4M 1021M 4M' /dev/vdb read 3145728/4194304 bytes at offset 1070596096 3 MiB, 1 ops; 0.0007 sec (3.865 GiB/sec and 1319.2612 ops/sec) 2. truncated iterator isn't reexpanded 3. iterator isn't reverted on blkdev_direct_IO() error 4. zero size read no longer skips atime update Fixes: ceaa762527f4 ("block: move direct_IO into our own read_iter handler") Signed-off-by: Ilya Dryomov Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220201100420.25875-1-idryomov@gmail.com Signed-off-by: Jens Axboe --- block/fops.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/block/fops.c b/block/fops.c index 26bf15c770d2..4f59e0f5bf30 100644 --- a/block/fops.c +++ b/block/fops.c @@ -566,34 +566,37 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); - size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; + size_t count; - if (unlikely(pos + count > size)) { + if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; - if (count > size) { - shorted = count - size; - iov_iter_truncate(to, size); - } + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); } + count = iov_iter_count(to); + if (!count) + goto reexpand; /* skip atime */ + if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; + if (filemap_range_needs_writeback(mapping, pos, + pos + count - 1)) { + ret = -EAGAIN; + goto reexpand; + } } else { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); + ret = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); if (ret < 0) - return ret; + goto reexpand; } file_accessed(iocb->ki_filp); @@ -603,12 +606,14 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iocb->ki_pos += ret; count -= ret; } + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) - return ret; + goto reexpand; } ret = filemap_read(iocb, to, ret); +reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; From c86d86131ab75696fc52d98571148842e067d620 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 2 Feb 2022 06:09:04 +0300 Subject: [PATCH 0974/1295] Partially revert "net/smc: Add netlink net namespace support" The change of sizeof(struct smc_diag_linkinfo) by commit 79d39fc503b4 ("net/smc: Add netlink net namespace support") introduced an ABI regression: since struct smc_diag_lgrinfo contains an object of type "struct smc_diag_linkinfo", offset of all subsequent members of struct smc_diag_lgrinfo was changed by that change. As result, applications compiled with the old version of struct smc_diag_linkinfo will receive garbage in struct smc_diag_lgrinfo.role if the kernel implements this new version of struct smc_diag_linkinfo. Fix this regression by reverting the part of commit 79d39fc503b4 that changes struct smc_diag_linkinfo. After all, there is SMC_GEN_NETLINK interface which is good enough, so there is probably no need to touch the smc_diag ABI in the first place. Fixes: 79d39fc503b4 ("net/smc: Add netlink net namespace support") Signed-off-by: Dmitry V. Levin Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/20220202030904.GA9742@altlinux.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/smc_diag.h | 11 +++++------ net/smc/smc_diag.c | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index c7008d87f1a4..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -84,12 +84,11 @@ struct smc_diag_conninfo { /* SMC_DIAG_LINKINFO */ struct smc_diag_linkinfo { - __u8 link_id; /* link identifier */ - __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ - __u8 ibport; /* RDMA device port number */ - __u8 gid[40]; /* local GID */ - __u8 peer_gid[40]; /* peer GID */ - __aligned_u64 net_cookie; /* RDMA device net namespace */ + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index b8898c787d23..1fca2f90a9c7 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -146,13 +146,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; - struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net); struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, - .lnk[0].net_cookie = net->net_cookie, }; memcpy(linfo.lnk[0].ibname, From d9c4e39c1f8f8a8ebaccf00b8f22c14364b2d27e Mon Sep 17 00:00:00 2001 From: "trondmy@kernel.org" Date: Tue, 18 Jan 2022 19:25:42 -0500 Subject: [PATCH 0975/1295] NFS: Don't overfill uncached readdir pages If we're doing an uncached read of the directory, then we ideally want to read only the exact set of entries that will fit in the buffer supplied by the getdents() system call. So unlike the case where we're reading into the page cache, let's send only one READDIR call, before trying to fill up the buffer. Fixes: 35df59d3ef69 ("NFS: Reduce number of RPC calls when doing uncached readdir") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 848f3b8fb821..43df0b365b98 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -867,7 +867,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); - } while (!status && nfs_readdir_page_needs_filling(page)); + } while (!status && nfs_readdir_page_needs_filling(page) && + page_mapping(page)); nfs_readdir_free_pages(pages, array_size); out: From ce292d8faf41f62e0fb0c78476c6fce5d629235a Mon Sep 17 00:00:00 2001 From: "trondmy@kernel.org" Date: Tue, 18 Jan 2022 19:52:16 -0500 Subject: [PATCH 0976/1295] NFS: Don't skip directory entries when doing uncached readdir Ensure that we initialise desc->cache_entry_index correctly in uncached_readdir(). Fixes: d1bacf9eb2fd ("NFS: add readdir cache array") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 43df0b365b98..a3de586d21e2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1042,6 +1042,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) goto out; desc->page_index = 0; + desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->duped = 0; From e1d2699b96793d19388e302fa095e0da2c145701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Jan 2022 22:10:52 -0500 Subject: [PATCH 0977/1295] NFS: Avoid duplicate uncached readdir calls on eof If we've reached the end of the directory, then cache that information in the context so that we don't need to do an uncached readdir in order to rediscover that fact. Fixes: 794092c57f89 ("NFS: Do uncached readdir when we're seeking a cookie in an empty page cache") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 +++++++++++++++----- include/linux/nfs_fs.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a3de586d21e2..7bc7cf6b26f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -80,6 +80,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->page_index = 0; + ctx->eof = false; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -168,6 +169,7 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; signed char duped; bool plus; + bool eob; bool eof; }; @@ -989,7 +991,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = true; + desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); @@ -1005,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->duped = 1; } if (array->page_is_eof) - desc->eof = true; + desc->eof = !desc->eob; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", @@ -1048,7 +1050,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; nfs_do_filldir(desc, verf); } @@ -1107,9 +1109,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->duped = dir_ctx->duped; page_index = dir_ctx->page_index; desc->attr_gencount = dir_ctx->attr_gencount; + desc->eof = dir_ctx->eof; memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); + if (desc->eof) { + res = 0; + goto out_free; + } + if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && list_is_singular(&nfsi->open_files)) invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); @@ -1143,7 +1151,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); - } while (!desc->eof); + } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; @@ -1151,9 +1159,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->eof = desc->eof; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); - +out_free: kfree(desc); out: @@ -1195,6 +1204,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset == 0) memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; + dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 02aa49323d1d..68f81d8d36de 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -107,6 +107,7 @@ struct nfs_open_dir_context { __u64 dup_cookie; pgoff_t page_index; signed char duped; + bool eof; }; /* From 186edf7e368c40d06cf727a1ad14698ea67b74ad Mon Sep 17 00:00:00 2001 From: Vratislav Bendel Date: Wed, 2 Feb 2022 12:25:11 +0100 Subject: [PATCH 0978/1295] selinux: fix double free of cond_list on error paths On error path from cond_read_list() and duplicate_policydb_cond_list() the cond_list_destroy() gets called a second time in caller functions, resulting in NULL pointer deref. Fix this by resetting the cond_list_len to 0 in cond_list_destroy(), making subsequent calls a noop. Also consistently reset the cond_list pointer to NULL after freeing. Cc: stable@vger.kernel.org Signed-off-by: Vratislav Bendel [PM: fix line lengths in the description] Signed-off-by: Paul Moore --- security/selinux/ss/conditional.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index 2ec6e5cd25d9..feb206f3acb4 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -152,6 +152,8 @@ static void cond_list_destroy(struct policydb *p) for (i = 0; i < p->cond_list_len; i++) cond_node_destroy(&p->cond_list[i]); kfree(p->cond_list); + p->cond_list = NULL; + p->cond_list_len = 0; } void cond_policydb_destroy(struct policydb *p) @@ -441,7 +443,6 @@ int cond_read_list(struct policydb *p, void *fp) return 0; err: cond_list_destroy(p); - p->cond_list = NULL; return rc; } From 83230351c523b04ff8a029a4bdf97d881ecb96fc Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Sat, 15 Jan 2022 09:11:11 +0800 Subject: [PATCH 0979/1295] integrity: check the return value of audit_log_start() audit_log_start() returns audit_buffer pointer on success or NULL on error, so it is better to check the return value of it. Fixes: 3323eec921ef ("integrity: IMA as an integrity service provider") Signed-off-by: Xiaoke Wang Cc: Reviewed-by: Paul Moore Signed-off-by: Mimi Zohar --- security/integrity/integrity_audit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 29220056207f..0ec5e4c22cb2 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -45,6 +45,8 @@ void integrity_audit_message(int audit_msgno, struct inode *inode, return; ab = audit_log_start(audit_context(), GFP_KERNEL, audit_msgno); + if (!ab) + return; audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, current_uid()), From f7333b9572d0559e00352a926c92f29f061b4569 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Tue, 25 Jan 2022 17:46:23 -0500 Subject: [PATCH 0980/1295] ima: Remove ima_policy file before directory The removal of ima_dir currently fails since ima_policy still exists, so remove the ima_policy file before removing the directory. Fixes: 4af4662fa4a9 ("integrity: IMA policy") Signed-off-by: Stefan Berger Cc: Acked-by: Christian Brauner Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3d8e9d5db5aa..3ad8f7734208 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -496,12 +496,12 @@ int __init ima_fs_init(void) return 0; out: + securityfs_remove(ima_policy); securityfs_remove(violations); securityfs_remove(runtime_measurements_count); securityfs_remove(ascii_runtime_measurements); securityfs_remove(binary_runtime_measurements); securityfs_remove(ima_symlink); securityfs_remove(ima_dir); - securityfs_remove(ima_policy); return -1; } From bb8e52e4906f148c2faf6656b5106cf7233e9301 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 31 Jan 2022 18:11:39 +0100 Subject: [PATCH 0981/1295] ima: Allow template selection with ima_template[_fmt]= after ima_hash= Commit c2426d2ad5027 ("ima: added support for new kernel cmdline parameter ima_template_fmt") introduced an additional check on the ima_template variable to avoid multiple template selection. Unfortunately, ima_template could be also set by the setup function of the ima_hash= parameter, when it calls ima_template_desc_current(). This causes attempts to choose a new template with ima_template= or with ima_template_fmt=, after ima_hash=, to be ignored. Achieve the goal of the commit mentioned with the new static variable template_setup_done, so that template selection requests after ima_hash= are not ignored. Finally, call ima_init_template_list(), if not already done, to initialize the list of templates before lookup_template_desc() is called. Reported-by: Guo Zihua Signed-off-by: Roberto Sassu Cc: stable@vger.kernel.org Fixes: c2426d2ad5027 ("ima: added support for new kernel cmdline parameter ima_template_fmt") Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_template.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 694560396be0..db1ad6d7a57f 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -29,6 +29,7 @@ static struct ima_template_desc builtin_templates[] = { static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); +static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, @@ -101,10 +102,11 @@ static int __init ima_template_setup(char *str) struct ima_template_desc *template_desc; int template_len = strlen(str); - if (ima_template) + if (template_setup_done) return 1; - ima_init_template_list(); + if (!ima_template) + ima_init_template_list(); /* * Verify that a template with the supplied name exists. @@ -128,6 +130,7 @@ static int __init ima_template_setup(char *str) } ima_template = template_desc; + template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); @@ -136,7 +139,7 @@ static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); - if (ima_template) + if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { @@ -147,6 +150,7 @@ static int __init ima_template_fmt_setup(char *str) builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; + template_setup_done = 1; return 1; } From 89677197ae709eb1ab3646952c44f6a171c9e74c Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Tue, 1 Feb 2022 15:37:10 -0500 Subject: [PATCH 0982/1295] ima: Do not print policy rule with inactive LSM labels Before printing a policy rule scan for inactive LSM labels in the policy rule. Inactive LSM labels are identified by args_p != NULL and rule == NULL. Fixes: 483ec26eed42 ("ima: ima/lsm policy rule loading logic bug fixes") Signed-off-by: Stefan Berger Cc: # v5.6+ Acked-by: Christian Brauner [zohar@linux.ibm.com: Updated "Fixes" tag] Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 320ca80aacab..2a1f6418b10a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1967,6 +1967,14 @@ int ima_policy_show(struct seq_file *m, void *v) rcu_read_lock(); + /* Do not print rules with inactive LSM labels */ + for (i = 0; i < MAX_LSM_RULES; i++) { + if (entry->lsm[i].args_p && !entry->lsm[i].rule) { + rcu_read_unlock(); + return 0; + } + } + if (entry->action & MEASURE) seq_puts(m, pt(Opt_measure)); if (entry->action & DONT_MEASURE) From 81eb8b0b18789e647e65579303529fd52d861cc2 Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Wed, 2 Feb 2022 09:30:39 +0100 Subject: [PATCH 0983/1295] net: sparx5: do not refer to skb after passing it on Do not try to use any SKB fields after the packet has been passed up in the receive stack. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Steen Hegelund Link: https://lore.kernel.org/r/20220202083039.3774851-1-steen.hegelund@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index dc7e5ea6ec15..148d431fcde4 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -145,9 +145,9 @@ static void sparx5_xtr_grp(struct sparx5 *sparx5, u8 grp, bool byte_swap) skb_put(skb, byte_cnt - ETH_FCS_LEN); eth_skb_pad(skb); skb->protocol = eth_type_trans(skb, netdev); - netif_rx(skb); netdev->stats.rx_bytes += skb->len; netdev->stats.rx_packets++; + netif_rx(skb); } static int sparx5_inject(struct sparx5 *sparx5, From 2ea88716369ac9a7486a8cb309d6bf1239ea156c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 23 Jan 2022 17:27:47 +0100 Subject: [PATCH 0984/1295] libceph: make recv path in secure mode work the same as send path The recv path of secure mode is intertwined with that of crc mode. While it's slightly more efficient that way (the ciphertext is read into the destination buffer and decrypted in place, thus avoiding two potentially heavy memory allocations for the bounce buffer and the corresponding sg array), it isn't really amenable to changes. Sacrifice that edge and align with the send path which always uses a full-sized bounce buffer (currently there is no other way -- if the kernel crypto API ever grows support for streaming (piecewise) en/decryption for GCM [1], we would be able to easily take advantage of that on both sides). [1] https://lore.kernel.org/all/20141225202830.GA18794@gondor.apana.org.au/ Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/messenger.h | 4 + net/ceph/messenger_v2.c | 216 +++++++++++++++++++++++---------- 2 files changed, 158 insertions(+), 62 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ff99ce094cfa..6c6b6ea52bb8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -383,6 +383,10 @@ struct ceph_connection_v2_info { struct ceph_gcm_nonce in_gcm_nonce; struct ceph_gcm_nonce out_gcm_nonce; + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; struct page **out_enc_pages; int out_enc_page_cnt; int out_enc_resid; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c4099b641b38..2ea00489e691 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -57,8 +57,9 @@ #define IN_S_HANDLE_CONTROL_REMAINDER 3 #define IN_S_PREPARE_READ_DATA 4 #define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_HANDLE_EPILOGUE 6 -#define IN_S_FINISH_SKIP 7 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_HANDLE_EPILOGUE 7 +#define IN_S_FINISH_SKIP 8 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -1032,22 +1033,41 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } -static int decrypt_message(struct ceph_connection *con) +static int decrypt_tail(struct ceph_connection *con) { + struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + int tail_len; int ret; + tail_len = tail_onwire_len(con->in_msg, true); + ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages, + con->v2.in_enc_page_cnt, 0, tail_len, + GFP_NOIO); + if (ret) + goto out; + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), con->v2.in_buf, true); if (ret) goto out; - ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, - tail_onwire_len(con->in_msg, true)); + dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con, + con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents); + ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len); + if (ret) + goto out; + + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; out: sg_free_table(&sgt); + sg_free_table(&enc_sgt); return ret; } @@ -1737,8 +1757,7 @@ static void prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = -1; + con->in_data_crc = -1; ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, data_len(con->in_msg)); @@ -1751,11 +1770,10 @@ static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { @@ -1766,21 +1784,94 @@ static void prepare_read_data_cont(struct ceph_connection *con) } /* - * We've read all data. Prepare to read data padding (if any) - * and epilogue. + * We've read all data. Prepare to read epilogue. */ reset_in_kvecs(con); - if (con_secure(con)) { - if (need_padding(data_len(con->in_msg))) - add_in_kvec(con, DATA_PAD(con->v2.in_buf), - padding_len(data_len(con->in_msg))); - add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void prepare_read_tail_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + if (!front_len(msg) && !middle_len(msg)) { + WARN_ON(!data_len(msg)); + prepare_read_data(con); + return; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + WARN_ON(msg->front.iov_len != front_len(msg)); + } + if (middle_len(msg)) { + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + WARN_ON(msg->middle->vec.iov_len != middle_len(msg)); + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +} + +static void prepare_read_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i, + con->v2.in_enc_resid); + WARN_ON(!con->v2.in_enc_resid); + + bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE); + + set_in_bvec(con, &bv); + con->v2.in_enc_i++; + con->v2.in_enc_resid -= bv.bv_len; + + if (con->v2.in_enc_resid) { + con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE; + return; + } + + /* + * We are set to read the last piece of ciphertext (ending + * with epilogue) + auth tag. + */ + WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_read_tail_secure(struct ceph_connection *con) +{ + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + + tail_len = tail_onwire_len(con->in_msg, true); + WARN_ON(!tail_len); + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) + return PTR_ERR(enc_pages); + + WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = enc_pages; + con->v2.in_enc_page_cnt = enc_page_cnt; + con->v2.in_enc_resid = tail_len; + con->v2.in_enc_i = 0; + + prepare_read_enc_page(con); + return 0; +} + static void __finish_skip(struct ceph_connection *con) { con->in_seq++; @@ -2589,46 +2680,26 @@ static int __handle_control(struct ceph_connection *con, void *p) } msg = con->in_msg; /* set in process_message_header() */ - if (!front_len(msg) && !middle_len(msg)) { - if (!data_len(msg)) - return process_message(con); - - prepare_read_data(con); - return 0; - } - - reset_in_kvecs(con); if (front_len(msg)) { WARN_ON(front_len(msg) > msg->front_alloc_len); - add_in_kvec(con, msg->front.iov_base, front_len(msg)); msg->front.iov_len = front_len(msg); - - if (con_secure(con) && need_padding(front_len(msg))) - add_in_kvec(con, FRONT_PAD(con->v2.in_buf), - padding_len(front_len(msg))); } else { msg->front.iov_len = 0; } if (middle_len(msg)) { WARN_ON(middle_len(msg) > msg->middle->alloc_len); - add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); msg->middle->vec.iov_len = middle_len(msg); - - if (con_secure(con) && need_padding(middle_len(msg))) - add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), - padding_len(middle_len(msg))); } else if (msg->middle) { msg->middle->vec.iov_len = 0; } - if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; - } else { - add_in_kvec(con, con->v2.in_buf, - con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN : - CEPH_EPILOGUE_PLAIN_LEN); - con->v2.in_state = IN_S_HANDLE_EPILOGUE; - } + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return process_message(con); + + if (con_secure(con)) + return prepare_read_tail_secure(con); + + prepare_read_tail_plain(con); return 0; } @@ -2717,7 +2788,7 @@ static int handle_epilogue(struct ceph_connection *con) int ret; if (con_secure(con)) { - ret = decrypt_message(con); + ret = decrypt_tail(con); if (ret) { if (ret == -EBADMSG) con->error_msg = "integrity error, bad epilogue auth tag"; @@ -2792,6 +2863,10 @@ static int populate_in_iter(struct ceph_connection *con) prepare_read_data_cont(con); ret = 0; break; + case IN_S_PREPARE_READ_ENC_PAGE: + prepare_read_enc_page(con); + ret = 0; + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3326,20 +3401,16 @@ void ceph_con_v2_revoke(struct ceph_connection *con) static void revoke_at_prepare_read_data(struct ceph_connection *con) { - int remaining; /* data + [data padding] + epilogue */ + int remaining; int resid; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); - if (con_secure(con)) - remaining = padded_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; - + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p resid %d remaining %d\n", __func__, con, resid, remaining); con->v2.in_iter.count -= resid; @@ -3350,8 +3421,9 @@ static void revoke_at_prepare_read_data(struct ceph_connection *con) static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) { int recved, resid; /* current piece of data */ - int remaining; /* [data padding] + epilogue */ + int remaining; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); @@ -3363,12 +3435,7 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) ceph_msg_data_advance(&con->v2.in_cursor, recved); WARN_ON(resid > con->v2.in_cursor.total_resid); - if (con_secure(con)) - remaining = padding_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = CEPH_EPILOGUE_PLAIN_LEN; - + remaining = CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p total_resid %zu remaining %d\n", __func__, con, con->v2.in_cursor.total_resid, remaining); con->v2.in_iter.count -= resid; @@ -3376,11 +3443,26 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) +{ + int resid; /* current enc page (not necessarily data) */ + + WARN_ON(!con_secure(con)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + + dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid, + con->v2.in_enc_resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + con->v2.in_enc_resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; - WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); @@ -3399,6 +3481,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_DATA_CONT: revoke_at_prepare_read_data_cont(con); break; + case IN_S_PREPARE_READ_ENC_PAGE: + revoke_at_prepare_read_enc_page(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; @@ -3432,6 +3517,13 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) clear_out_sign_kvecs(con); free_conn_bufs(con); + if (con->v2.in_enc_pages) { + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; + } if (con->v2.out_enc_pages) { WARN_ON(!con->v2.out_enc_page_cnt); ceph_release_page_vector(con->v2.out_enc_pages, From 038b8d1d1ab1cce11a158d30bf080ff41a2cfd15 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 30 Dec 2021 15:13:32 +0100 Subject: [PATCH 0985/1295] libceph: optionally use bounce buffer on recv path in crc mode Both msgr1 and msgr2 in crc mode are zero copy in the sense that message data is read from the socket directly into the destination buffer. We assume that the destination buffer is stable (i.e. remains unchanged while it is being read to) though. Otherwise, CRC errors ensue: libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706 libceph: osd1 (1)192.168.122.1:6843 bad crc/signature libceph: bad data crc, calculated 57958023, expected 1805382778 libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc Introduce rxbounce option to enable use of a bounce buffer when receiving message data. In particular this is needed if a mapped image is a Windows VM disk, passed to QEMU. Windows has a system-wide "dummy" page that may be mapped into the destination buffer (potentially more than once into the same buffer) by the Windows Memory Manager in an effort to generate a single large I/O [1][2]. QEMU makes a point of preserving overlap relationships when cloning I/O vectors, so krbd gets exposed to this behaviour. [1] "What Is Really in That MDL?" https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85) [2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/ URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + net/ceph/ceph_common.c | 7 ++++ net/ceph/messenger.c | 4 +++ net/ceph/messenger_v1.c | 54 +++++++++++++++++++++++++++---- net/ceph/messenger_v2.c | 58 ++++++++++++++++++++++++++-------- 6 files changed, 105 insertions(+), 20 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6a89ea410e43..edf62eaa6285 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c6b6ea52bb8..e7f2fb2fc207 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -461,6 +461,7 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ecc400a0b7bb..4c6441536d55 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -246,6 +246,7 @@ enum { Opt_cephx_sign_messages, Opt_tcp_nodelay, Opt_abort_on_full, + Opt_rxbounce, }; enum { @@ -295,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_flag ("rxbounce", Opt_rxbounce), fsparam_enum ("ms_mode", Opt_ms_mode, ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), @@ -584,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_abort_on_full: opt->flags |= CEPH_OPT_ABORT_ON_FULL; break; + case Opt_rxbounce: + opt->flags |= CEPH_OPT_RXBOUNCE; + break; default: BUG(); @@ -660,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, seq_puts(m, "notcp_nodelay,"); if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) seq_puts(m, "abort_on_full,"); + if (opt->flags & CEPH_OPT_RXBOUNCE) + seq_puts(m, "rxbounce,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 45eba2dcb67a..d3bb656308b4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + if (con->bounce_page) { + __free_page(con->bounce_page); + con->bounce_page = NULL; + } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 2cb5ffdf071a..6b014eca3a13 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -992,8 +992,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; @@ -1001,9 +1000,6 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - if (!msg->num_data_items) - return -EIO; - if (do_datacrc) crc = con->in_data_crc; while (cursor->total_resid) { @@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con) return 1; /* must return > 0 to indicate success */ } +static int read_partial_msg_data_bounce(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + struct page *page; + size_t off, len; + u32 crc; + int ret; + + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &off, &len, NULL); + ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); + if (ret <= 0) { + con->in_data_crc = crc; + return ret; + } + + crc = crc32c(crc, page_address(con->bounce_page), ret); + memcpy_to_page(page, off, page_address(con->bounce_page), ret); + + ceph_msg_data_advance(cursor, ret); + } + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + /* * read (part of) a message. */ @@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con) /* (page) data */ if (data_len) { - ret = read_partial_msg_data(con); + if (!m->num_data_items) + return -EIO; + + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + ret = read_partial_msg_data_bounce(con); + else + ret = read_partial_msg_data(con); if (ret <= 0) return ret; } diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 2ea00489e691..c81379f93ad5 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1753,7 +1753,7 @@ static int prepare_read_control_remainder(struct ceph_connection *con) return 0; } -static void prepare_read_data(struct ceph_connection *con) +static int prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; @@ -1762,23 +1762,55 @@ static void prepare_read_data(struct ceph_connection *con) data_len(con->in_msg)); get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; + return 0; } static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + + get_bvec_at(&con->v2.in_cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + } ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); return; } @@ -1791,14 +1823,13 @@ static void prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_HANDLE_EPILOGUE; } -static void prepare_read_tail_plain(struct ceph_connection *con) +static int prepare_read_tail_plain(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; if (!front_len(msg) && !middle_len(msg)) { WARN_ON(!data_len(msg)); - prepare_read_data(con); - return; + return prepare_read_data(con); } reset_in_kvecs(con); @@ -1817,6 +1848,7 @@ static void prepare_read_tail_plain(struct ceph_connection *con) add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } + return 0; } static void prepare_read_enc_page(struct ceph_connection *con) @@ -2699,8 +2731,7 @@ static int __handle_control(struct ceph_connection *con, void *p) if (con_secure(con)) return prepare_read_tail_secure(con); - prepare_read_tail_plain(con); - return 0; + return prepare_read_tail_plain(con); } static int handle_preamble(struct ceph_connection *con) @@ -2856,8 +2887,7 @@ static int populate_in_iter(struct ceph_connection *con) ret = handle_control_remainder(con); break; case IN_S_PREPARE_READ_DATA: - prepare_read_data(con); - ret = 0; + ret = prepare_read_data(con); break; case IN_S_PREPARE_READ_DATA_CONT: prepare_read_data_cont(con); From de4d73b16d5d9c3e5d03a66046e5410e1e74f903 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Thu, 27 Jan 2022 14:17:10 -0800 Subject: [PATCH 0986/1295] kunit: fix missing f in f-string in run_checks.py We're missing the `f` prefix to have python do string interpolation, so we'd never end up printing what the actual "unexpected" error is. Fixes: ee92ed38364e ("kunit: add run_checks.py script to validate kunit changes") Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/run_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/kunit/run_checks.py b/tools/testing/kunit/run_checks.py index 4f32133ed77c..13d854afca9d 100755 --- a/tools/testing/kunit/run_checks.py +++ b/tools/testing/kunit/run_checks.py @@ -61,7 +61,7 @@ def main(argv: Sequence[str]) -> None: elif isinstance(ex, subprocess.CalledProcessError): print(f'{name}: FAILED') else: - print('{name}: unexpected exception: {ex}') + print(f'{name}: unexpected exception: {ex}') continue output = ex.output From 0f9650bd838efe5c52f7e5f40c3204ad59f1964d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 2 Feb 2022 09:24:10 -0800 Subject: [PATCH 0987/1295] md: fix NULL pointer deref with nowait but no mddev->queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leon reported NULL pointer deref with nowait support: [ 15.123761] device-mapper: raid: Loading target version 1.15.1 [ 15.124185] device-mapper: raid: Ignoring chunk size parameter for RAID 1 [ 15.124192] device-mapper: raid: Choosing default region size of 4MiB [ 15.129524] BUG: kernel NULL pointer dereference, address: 0000000000000060 [ 15.129530] #PF: supervisor write access in kernel mode [ 15.129533] #PF: error_code(0x0002) - not-present page [ 15.129535] PGD 0 P4D 0 [ 15.129538] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 15.129541] CPU: 5 PID: 494 Comm: ldmtool Not tainted 5.17.0-rc2-1-mainline #1 9fe89d43dfcb215d2731e6f8851740520778615e [ 15.129546] Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS ELITE/X570 AORUS ELITE, BIOS F36e 10/14/2021 [ 15.129549] RIP: 0010:blk_queue_flag_set+0x7/0x20 [ 15.129555] Code: 00 00 00 0f 1f 44 00 00 48 8b 35 e4 e0 04 02 48 8d 57 28 bf 40 01 \ 00 00 e9 16 c1 be ff 66 0f 1f 44 00 00 0f 1f 44 00 00 89 ff 48 0f ab 7e 60 \ 31 f6 89 f7 c3 66 66 2e 0f 1f 84 00 00 00 00 00 [ 15.129559] RSP: 0018:ffff966b81987a88 EFLAGS: 00010202 [ 15.129562] RAX: ffff8b11c363a0d0 RBX: ffff8b11e294b070 RCX: 0000000000000000 [ 15.129564] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000001d [ 15.129566] RBP: ffff8b11e294b058 R08: 0000000000000000 R09: 0000000000000000 [ 15.129568] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b11e294b070 [ 15.129570] R13: 0000000000000000 R14: ffff8b11e294b000 R15: 0000000000000001 [ 15.129572] FS: 00007fa96e826780(0000) GS:ffff8b18deb40000(0000) knlGS:0000000000000000 [ 15.129575] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 15.129577] CR2: 0000000000000060 CR3: 000000010b8ce000 CR4: 00000000003506e0 [ 15.129580] Call Trace: [ 15.129582] [ 15.129584] md_run+0x67c/0xc70 [md_mod 1e470c1b6bcf1114198109f42682f5a2740e9531] [ 15.129597] raid_ctr+0x134a/0x28ea [dm_raid 6a645dd7519e72834bd7e98c23497eeade14cd63] [ 15.129604] ? dm_split_args+0x63/0x150 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129615] dm_table_add_target+0x188/0x380 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129625] table_load+0x13b/0x370 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129635] ? dev_suspend+0x2d0/0x2d0 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129644] ctl_ioctl+0x1bd/0x460 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129655] dm_ctl_ioctl+0xa/0x20 [dm_mod 0d7b0bc3414340a79c4553bae5ca97294b78336e] [ 15.129663] __x64_sys_ioctl+0x8e/0xd0 [ 15.129667] do_syscall_64+0x5c/0x90 [ 15.129672] ? syscall_exit_to_user_mode+0x23/0x50 [ 15.129675] ? do_syscall_64+0x69/0x90 [ 15.129677] ? do_syscall_64+0x69/0x90 [ 15.129679] ? syscall_exit_to_user_mode+0x23/0x50 [ 15.129682] ? do_syscall_64+0x69/0x90 [ 15.129684] ? do_syscall_64+0x69/0x90 [ 15.129686] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 15.129689] RIP: 0033:0x7fa96ecd559b [ 15.129692] Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c \ c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff \ ff 73 01 c3 48 8b 0d a5 a8 0c 00 f7 d8 64 89 01 48 [ 15.129696] RSP: 002b:00007ffcaf85c258 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 [ 15.129699] RAX: ffffffffffffffda RBX: 00007fa96f1b48f0 RCX: 00007fa96ecd559b [ 15.129701] RDX: 00007fa97017e610 RSI: 00000000c138fd09 RDI: 0000000000000003 [ 15.129702] RBP: 00007fa96ebab583 R08: 00007fa97017c9e0 R09: 00007ffcaf85bf27 [ 15.129704] R10: 0000000000000001 R11: 0000000000000206 R12: 00007fa97017e610 [ 15.129706] R13: 00007fa97017e640 R14: 00007fa97017e6c0 R15: 00007fa97017e530 [ 15.129709] This is caused by missing mddev->queue check for setting QUEUE_FLAG_NOWAIT Fix this by moving the QUEUE_FLAG_NOWAIT logic to under mddev->queue check. Fixes: f51d46d0e7cb ("md: add support for REQ_NOWAIT") Reported-by: Leon Möller Tested-by: Leon Möller Cc: Vishal Verma Signed-off-by: Song Liu --- drivers/md/md.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5881d05a76eb..4d38bd7dadd6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5869,10 +5869,6 @@ int md_run(struct mddev *mddev) nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); } - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); - if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) @@ -6010,6 +6006,10 @@ int md_run(struct mddev *mddev) else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); + + /* Set the NOWAIT flags if all underlying devices support it */ + if (nowait) + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && From bfb1a7c91fb7758273b4a8d735313d9cc388b502 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 2 Feb 2022 12:55:53 -0800 Subject: [PATCH 0988/1295] x86/bug: Merge annotate_reachable() into _BUG_FLAGS() asm In __WARN_FLAGS(), we had two asm statements (abbreviated): asm volatile("ud2"); asm volatile(".pushsection .discard.reachable"); These pair of statements are used to trigger an exception, but then help objtool understand that for warnings, control flow will be restored immediately afterwards. The problem is that volatile is not a compiler barrier. GCC explicitly documents this: > Note that the compiler can move even volatile asm instructions > relative to other code, including across jump instructions. Also, no clobbers are specified to prevent instructions from subsequent statements from being scheduled by compiler before the second asm statement. This can lead to instructions from subsequent statements being emitted by the compiler before the second asm statement. Providing a scheduling model such as via -march= options enables the compiler to better schedule instructions with known latencies to hide latencies from data hazards compared to inline asm statements in which latencies are not estimated. If an instruction gets scheduled by the compiler between the two asm statements, then objtool will think that it is not reachable, producing a warning. To prevent instructions from being scheduled in between the two asm statements, merge them. Also remove an unnecessary unreachable() asm annotation from BUG() in favor of __builtin_unreachable(). objtool is able to track that the ud2 from BUG() terminates control flow within the function. Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile Link: https://github.com/ClangBuiltLinux/linux/issues/1483 Signed-off-by: Nick Desaulniers Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220202205557.2260694-1-ndesaulniers@google.com --- arch/x86/include/asm/bug.h | 20 +++++++++++--------- include/linux/compiler.h | 21 +++++---------------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 84b87538a15d..bab883c0b6fe 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,7 +22,7 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ @@ -31,7 +31,8 @@ do { \ "\t.word %c1" "\t# bug_entry::line\n" \ "\t.word %c2" "\t# bug_entry::flags\n" \ "\t.org 2b+%c3\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ @@ -39,14 +40,15 @@ do { \ #else /* !CONFIG_DEBUG_BUGVERBOSE */ -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -55,7 +57,7 @@ do { \ #else -#define _BUG_FLAGS(ins, flags) asm volatile(ins) +#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -63,8 +65,8 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0); \ - unreachable(); \ + _BUG_FLAGS(ASM_UD2, 0, ""); \ + __builtin_unreachable(); \ } while (0) /* @@ -75,9 +77,9 @@ do { \ */ #define __WARN_FLAGS(flags) \ do { \ + __auto_type f = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ - annotate_reachable(); \ + _BUG_FLAGS(ASM_UD2, f, ASM_REACHABLE); \ instrumentation_end(); \ } while (0) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 429dcebe2b99..0f7fd205ab7e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,14 +117,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -133,24 +125,21 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ +#define ASM_REACHABLE \ + "998:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long 998b - .\n\t" \ ".popsection\n\t" /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") #else -#define annotate_reachable() #define annotate_unreachable() +# define ASM_REACHABLE #define __annotate_jump_table #endif -#ifndef ASM_UNREACHABLE -# define ASM_UNREACHABLE -#endif #ifndef unreachable # define unreachable() do { \ annotate_unreachable(); \ From f52a2b8badbd24faf73a13c9c07fdb9d07352944 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:35:09 -0600 Subject: [PATCH 0989/1295] drm/amd: add support to check whether the system is set to s3 This will be used to help make decisions on what to do in misconfigured systems. v2: squash in semicolon fix from Stephen Rothwell Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 5c466d8972f5..9a53a4de2bb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1421,9 +1421,11 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, #endif #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) +bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } +static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } #endif int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index b19d40751802..0e12315fa0cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1032,6 +1032,19 @@ void amdgpu_acpi_detect(void) } #if IS_ENABLED(CONFIG_SUSPEND) +/** + * amdgpu_acpi_is_s3_active + * + * @adev: amdgpu_device_pointer + * + * returns true if supported, false if not. + */ +bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) +{ + return !(adev->flags & AMD_IS_APU) || + (pm_suspend_target_state == PM_SUSPEND_MEM); +} + /** * amdgpu_acpi_is_s0ix_active * From 04ef860469fda6a646dc841190d05b31fae68e8c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:37:57 -0600 Subject: [PATCH 0990/1295] drm/amd: Only run s3 or s0ix if system is configured properly This will cause misconfigured systems to not run the GPU suspend routines. * In APUs that are properly configured system will go into s2idle. * In APUs that are intended to be S3 but user selects s2idle the GPU will stay fully powered for the suspend. * In APUs that are intended to be s2idle and system misconfigured the GPU will stay fully powered for the suspend. * In systems that are intended to be s2idle, but AMD dGPU is also present, the dGPU will go through S3 Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 4c83f1db8a24..17747252e722 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2246,6 +2246,7 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) static int amdgpu_pmops_prepare(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); /* Return a positive number here so * DPM_FLAG_SMART_SUSPEND works properly @@ -2254,6 +2255,13 @@ static int amdgpu_pmops_prepare(struct device *dev) return pm_runtime_suspended(dev) && pm_suspend_via_firmware(); + /* if we will not support s3 or s2i for the device + * then skip suspend + */ + if (!amdgpu_acpi_is_s0ix_active(adev) && + !amdgpu_acpi_is_s3_active(adev)) + return 1; + return 0; } From bca52455a3c07922ee976714b00563a13a29ab15 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Fri, 28 Jan 2022 18:24:53 +0800 Subject: [PATCH 0991/1295] drm/amdgpu: fix a potential GPU hang on cyan skillfish We observed a GPU hang when querying GMC CG state(i.e., cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan skillfish doesn't support any CG features. Just prevent it from accessing GMC CG registers. Signed-off-by: Lang Yu Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 38bb42727715..a2f8ed0e6a64 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -1140,6 +1140,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3)) + return; + adev->mmhub.funcs->get_clockgating(adev, flags); if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0)) From 2d8ae25d233767171942a9fba5fd8f4a620996be Mon Sep 17 00:00:00 2001 From: Agustin Gutierrez Date: Fri, 28 Jan 2022 17:51:53 -0500 Subject: [PATCH 0992/1295] drm/amd/display: Update watermark values for DCN301 [Why] There is underflow / visual corruption DCN301, for high bandwidth MST DSC configurations such as 2x1440p144 or 2x4k60. [How] Use up-to-date watermark values for DCN301. Reviewed-by: Zhan Liu Signed-off-by: Agustin Gutierrez Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c index 48005def1164..bc4ddc36fe58 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c @@ -570,32 +570,32 @@ static struct wm_table lpddr5_wm_table = { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 7.95, - .sr_enter_plus_exit_time_us = 9, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.82, - .sr_enter_plus_exit_time_us = 11.196, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.89, - .sr_enter_plus_exit_time_us = 11.24, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.748, - .sr_enter_plus_exit_time_us = 11.102, + .sr_exit_time_us = 13.5, + .sr_enter_plus_exit_time_us = 16.5, .valid = true, }, } From f5fa54f45ab41cbb1f99b1208f49554132ffb207 Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Fri, 28 Jan 2022 22:03:57 +0800 Subject: [PATCH 0993/1295] drm/amd/display: watermark latencies is not enough on DCN31 [Why] The original latencies were causing underflow in some modes. Resolution: 2880x1620@60p when HDR enable [How] 1. Replace with the up-to-date watermark values based on new measurments 2. Correct the ddr_wm_table name to DDR5 on DCN31 Tested-by: Daniel Wheeler Reviewed-by: Aric Cyr Acked-by: Stylon Wang Signed-off-by: Paul Hsieh Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c index 4162ce40089b..9d17c5a5ae01 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c @@ -329,38 +329,38 @@ static struct clk_bw_params dcn31_bw_params = { }; -static struct wm_table ddr4_wm_table = { +static struct wm_table ddr5_wm_table = { .entries = { { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 6.09, - .sr_enter_plus_exit_time_us = 7.14, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 9, + .sr_enter_plus_exit_time_us = 11, .valid = true, }, } @@ -687,7 +687,7 @@ void dcn31_clk_mgr_construct( if (ctx->dc_bios->integrated_info->memory_type == LpDdr5MemType) { dcn31_bw_params.wm_table = lpddr5_wm_table; } else { - dcn31_bw_params.wm_table = ddr4_wm_table; + dcn31_bw_params.wm_table = ddr5_wm_table; } /* Saved clocks configured at boot for debug purposes */ dcn31_dump_clk_registers(&clk_mgr->base.base.boot_snapshot, &clk_mgr->base.base, &log_info); From 49a6ebb95d04bdaa5d57313a380c44249cf02100 Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Fri, 28 Jan 2022 22:03:59 +0800 Subject: [PATCH 0994/1295] drm/amd/display: revert "Reset fifo after enable otg" [Why] This change causes regression, that prevents some systems from lighting up internal displays. [How] Revert this patch until a new solution is ready. Tested-by: Daniel Wheeler Reviewed-by: Charlene Liu Acked-by: Stylon Wang Signed-off-by: Zhan Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/display/dc/dce110/dce110_hw_sequencer.c | 5 ----- .../amd/display/dc/dcn10/dcn10_stream_encoder.c | 15 --------------- .../amd/display/dc/dcn10/dcn10_stream_encoder.h | 3 --- .../amd/display/dc/dcn20/dcn20_stream_encoder.c | 2 -- .../display/dc/dcn30/dcn30_dio_stream_encoder.c | 2 -- .../drm/amd/display/dc/inc/hw/stream_encoder.h | 4 ---- 6 files changed, 31 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index f3ff141b706a..26ec69bb5db9 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -1608,11 +1608,6 @@ static enum dc_status apply_single_controller_ctx_to_hw( pipe_ctx->stream_res.stream_enc, pipe_ctx->stream_res.tg->inst); - if (dc_is_embedded_signal(pipe_ctx->stream->signal) && - pipe_ctx->stream_res.stream_enc->funcs->reset_fifo) - pipe_ctx->stream_res.stream_enc->funcs->reset_fifo( - pipe_ctx->stream_res.stream_enc); - if (dc_is_dp_signal(pipe_ctx->stream->signal)) dp_source_sequence_trace(link, DPCD_SOURCE_SEQ_AFTER_CONNECT_DIG_FE_OTG); diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c index bf4436d7aaab..b0c08ee6bc2c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.c @@ -902,19 +902,6 @@ void enc1_stream_encoder_stop_dp_info_packets( } -void enc1_stream_encoder_reset_fifo( - struct stream_encoder *enc) -{ - struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); - - /* set DIG_START to 0x1 to reset FIFO */ - REG_UPDATE(DIG_FE_CNTL, DIG_START, 1); - udelay(100); - - /* write 0 to take the FIFO out of reset */ - REG_UPDATE(DIG_FE_CNTL, DIG_START, 0); -} - void enc1_stream_encoder_dp_blank( struct dc_link *link, struct stream_encoder *enc) @@ -1600,8 +1587,6 @@ static const struct stream_encoder_funcs dcn10_str_enc_funcs = { enc1_stream_encoder_send_immediate_sdp_message, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h index a146a41f68e9..687d7e4bf7ca 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_stream_encoder.h @@ -626,9 +626,6 @@ void enc1_stream_encoder_send_immediate_sdp_message( void enc1_stream_encoder_stop_dp_info_packets( struct stream_encoder *enc); -void enc1_stream_encoder_reset_fifo( - struct stream_encoder *enc); - void enc1_stream_encoder_dp_blank( struct dc_link *link, struct stream_encoder *enc); diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c index 8a70f92795c2..aab25ca8343a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_stream_encoder.c @@ -593,8 +593,6 @@ static const struct stream_encoder_funcs dcn20_str_enc_funcs = { enc1_stream_encoder_send_immediate_sdp_message, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c index 8daa12730bc1..a04ca4a98392 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dio_stream_encoder.c @@ -789,8 +789,6 @@ static const struct stream_encoder_funcs dcn30_str_enc_funcs = { enc3_stream_encoder_update_dp_info_packets, .stop_dp_info_packets = enc1_stream_encoder_stop_dp_info_packets, - .reset_fifo = - enc1_stream_encoder_reset_fifo, .dp_blank = enc1_stream_encoder_dp_blank, .dp_unblank = diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h b/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h index 073f8b667eff..c88e113b94d1 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/stream_encoder.h @@ -164,10 +164,6 @@ struct stream_encoder_funcs { void (*stop_dp_info_packets)( struct stream_encoder *enc); - void (*reset_fifo)( - struct stream_encoder *enc - ); - void (*dp_blank)( struct dc_link *link, struct stream_encoder *enc); From 30fbce374745a9c6af93c775a5ac49a97f822fda Mon Sep 17 00:00:00 2001 From: Aun-Ali Zaidi Date: Sat, 29 Jan 2022 05:49:55 +0000 Subject: [PATCH 0995/1295] drm/amd/display: Force link_rate as LINK_RATE_RBR2 for 2018 15" Apple Retina panels The eDP link rate reported by the DP_MAX_LINK_RATE dpcd register (0xa) is contradictory to the highest rate supported reported by EDID (0xc = LINK_RATE_RBR2). The effects of this compounded with commit '4a8ca46bae8a ("drm/amd/display: Default max bpc to 16 for eDP")' results in no display modes being found and a dark panel. For now, simply force the maximum supported link rate for the eDP attached 2018 15" Apple Retina panels. Additionally, we must also check the firmware revision since the device ID reported by the DPCD is identical to that of the more capable 16,1, incorrectly quirking it. We also use said firmware check to quirk the refreshed 15,1 models with Vega graphics as they use a slightly newer firmware version. Tested-by: Aun-Ali Zaidi Reviewed-by: Harry Wentland Signed-off-by: Aun-Ali Zaidi Signed-off-by: Aditya Garg Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/core/dc_link_dp.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 4c3ab2575e4b..61b8f29a0c30 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -5597,6 +5597,26 @@ static bool retrieve_link_cap(struct dc_link *link) dp_hw_fw_revision.ieee_fw_rev, sizeof(dp_hw_fw_revision.ieee_fw_rev)); + /* Quirk for Apple MBP 2018 15" Retina panels: wrong DP_MAX_LINK_RATE */ + { + uint8_t str_mbp_2018[] = { 101, 68, 21, 103, 98, 97 }; + uint8_t fwrev_mbp_2018[] = { 7, 4 }; + uint8_t fwrev_mbp_2018_vega[] = { 8, 4 }; + + /* We also check for the firmware revision as 16,1 models have an + * identical device id and are incorrectly quirked otherwise. + */ + if ((link->dpcd_caps.sink_dev_id == 0x0010fa) && + !memcmp(link->dpcd_caps.sink_dev_id_str, str_mbp_2018, + sizeof(str_mbp_2018)) && + (!memcmp(link->dpcd_caps.sink_fw_revision, fwrev_mbp_2018, + sizeof(fwrev_mbp_2018)) || + !memcmp(link->dpcd_caps.sink_fw_revision, fwrev_mbp_2018_vega, + sizeof(fwrev_mbp_2018_vega)))) { + link->reported_link_cap.link_rate = LINK_RATE_RBR2; + } + } + memset(&link->dpcd_caps.dsc_caps, '\0', sizeof(link->dpcd_caps.dsc_caps)); memset(&link->dpcd_caps.fec_cap, '\0', sizeof(link->dpcd_caps.fec_cap)); From e55a3aea418269266d84f426b3bd70794d3389c8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 21:46:58 -0600 Subject: [PATCH 0996/1295] drm/amd: avoid suspend on dGPUs w/ s2idle support when runtime PM enabled dGPUs connected to Intel systems configured for suspend to idle will not have the power rails cut at suspend and resetting the GPU may lead to problematic behaviors. Fixes: e25443d2765f4 ("drm/amdgpu: add a dev_pm_ops prepare callback (v2)") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1879 Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 17747252e722..63a089992645 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2252,8 +2252,7 @@ static int amdgpu_pmops_prepare(struct device *dev) * DPM_FLAG_SMART_SUSPEND works properly */ if (amdgpu_device_supports_boco(drm_dev)) - return pm_runtime_suspended(dev) && - pm_suspend_via_firmware(); + return pm_runtime_suspended(dev); /* if we will not support s3 or s2i for the device * then skip suspend From e8ae38720e1a685fd98cfa5ae118c9d07b45ca79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 28 Jan 2022 13:21:10 +0100 Subject: [PATCH 0997/1295] drm/amdgpu: fix logic inversion in check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We probably never trigger this, but the logic inside the check is inverted. Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 5c3f24069f2a..4655702a5e00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1904,7 +1904,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, unsigned i; int r; - if (direct_submit && !ring->sched.ready) { + if (!direct_submit && !ring->sched.ready) { DRM_ERROR("Trying to move memory with ring turned off.\n"); return -EINVAL; } From c4f9c8bbcc24f00002827f73957053a59aba5646 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 3 Feb 2022 00:30:38 +0300 Subject: [PATCH 0998/1295] MAINTAINERS: add myself as PATA drivers reviewer Add myself as a reviewer for the libata PATA drivers -- there is activity in this area still... 8-) Having been hacking on ATA from the early 90s, I think I deserved this highly responsible position, at last! :-) Signed-off-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae..3dbc66c37335 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10880,6 +10880,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: drivers/ata/pata_arasan_cf.c F: include/linux/pata_arasan_cf_data.h +LIBATA PATA DRIVERS +R: Sergey Shtylyov +L: linux-ide@vger.kernel.org +F: drivers/ata/ata_*.c +F: drivers/ata/pata_*.c + LIBATA PATA FARADAY FTIDE010 AND GEMINI SATA BRIDGE DRIVERS M: Linus Walleij L: linux-ide@vger.kernel.org From b67985be400969578d4d4b17299714c0e5d2c07b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Feb 2022 10:46:40 -0800 Subject: [PATCH 0999/1295] tcp: add missing tcp_skb_can_collapse() test in tcp_shift_skb_data() tcp_shift_skb_data() might collapse three packets into a larger one. P_A, P_B, P_C -> P_ABC Historically, it used a single tcp_skb_can_collapse_to(P_A) call, because it was enough. In commit 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions"), this call was replaced by a call to tcp_skb_can_collapse(P_A, P_B) But the now needed test over P_C has been missed. This probably broke MPTCP. Then later, commit 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") added an extra condition to tcp_skb_can_collapse(), but the missing call from tcp_shift_skb_data() is also breaking TCP zerocopy, because P_A and P_C might have different skb_zcopy_pure() status. Fixes: 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions") Fixes: 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") Signed-off-by: Eric Dumazet Cc: Mat Martineau Cc: Talal Ahmad Cc: Arjun Roy Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220201184640.756716-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc49a3d551eb..bfe4112e000c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1660,6 +1660,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, (mss != tcp_skb_seglen(skb))) goto out; + if (!tcp_skb_can_collapse(prev, skb)) + goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) From 4a81f6da9cb2d1ef911131a6fd8bd15cb61fc772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Feb 2022 20:39:42 +0100 Subject: [PATCH 1000/1295] net, neigh: Do not trigger immediate probes on NUD_FAILED from neigh_managed_work syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]: kworker/0:16/14617 is trying to acquire lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 [...] but task is already holding lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572 The neighbor entry turned to NUD_FAILED state, where __neigh_event_send() triggered an immediate probe as per commit cd28ca0a3dd1 ("neigh: reduce arp latency") via neigh_probe() given table lock was held. One option to fix this situation is to defer the neigh_probe() back to the neigh_timer_handler() similarly as pre cd28ca0a3dd1. For the case of NTF_MANAGED, this deferral is acceptable given this only happens on actual failure state and regular / expected state is NUD_VALID with the entry already present. The fix adds a parameter to __neigh_event_send() in order to communicate whether immediate probe is allowed or disallowed. Existing call-sites of neigh_event_send() default as-is to immediate probe. However, the neigh_managed_work() disables it via use of neigh_event_send_probe(). [0] __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2956 [inline] check_deadlock kernel/locking/lockdep.c:2999 [inline] validate_chain kernel/locking/lockdep.c:3788 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027 lock_acquire kernel/locking/lockdep.c:5639 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604 __raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline] _raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334 ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508 ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650 ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742 neigh_probe+0xc2/0x110 net/core/neighbour.c:1040 __neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201 neigh_event_send include/net/neighbour.h:470 [inline] neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7482e3841d52 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries") Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Roopa Prabhu Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 18 +++++++++++++----- net/core/neighbour.c | 18 ++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 937389e04c8e..87419f7f5421 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -350,7 +350,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); @@ -460,17 +461,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6c2016f7f3d1..ec0bf737b076 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1133,7 +1133,8 @@ out: neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,12 +1155,17 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; @@ -1571,7 +1577,7 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); write_unlock_bh(&tbl->lock); From 40c845c176953311c8fc232e977516f7733c64cd Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Tue, 1 Feb 2022 07:22:02 +0000 Subject: [PATCH 1001/1295] Invalidate fscache cookie only when inode attributes are changed. For example if mtime or size has changed. Signed-off-by: Rohith Surabattula Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7d8b3ceb2af3..60d853c92f6a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -83,6 +83,7 @@ static void cifs_set_ops(struct inode *inode) static void cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifs_i = CIFS_I(inode); cifs_dbg(FYI, "%s: revalidating inode %llu\n", @@ -113,6 +114,9 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", __func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); + /* Invalidate fscache cookie */ + cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } /* @@ -2261,8 +2265,6 @@ cifs_dentry_needs_reval(struct dentry *dentry) int cifs_invalidate_mapping(struct inode *inode) { - struct cifs_fscache_inode_coherency_data cd; - struct cifsInodeInfo *cifsi = CIFS_I(inode); int rc = 0; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { @@ -2272,8 +2274,6 @@ cifs_invalidate_mapping(struct inode *inode) __func__, inode); } - cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); return rc; } From d3b331fb51f326d5b5326010bf2b5841bb86cdc6 Mon Sep 17 00:00:00 2001 From: Ryan Bair Date: Wed, 22 Dec 2021 11:04:05 -0500 Subject: [PATCH 1002/1295] cifs: fix workstation_name for multiuser mounts Set workstation_name from the master_tcon for multiuser mounts. Just in case, protect size_of_ntlmssp_blob against a NULL workstation_name. Fixes: 49bd49f983b5 ("cifs: send workstation name during ntlmssp session setup") Cc: stable@vger.kernel.org # 5.16 Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Ryan Bair Signed-off-by: Steve French --- fs/cifs/connect.c | 13 +++++++++++++ fs/cifs/sess.c | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5a51a098b845..0b742bd50642 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1977,6 +1977,19 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } } + ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL); + if (!ctx->workstation_name) { + cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n"); + rc = -ENOMEM; + kfree(ctx->username); + ctx->username = NULL; + kfree_sensitive(ctx->password); + ctx->password = NULL; + kfree(ctx->domainname); + ctx->domainname = NULL; + goto out_key_put; + } + out_key_put: up_read(&key->sem); key_put(key); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index dc3b16d1be09..5723d50340e5 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -713,7 +713,11 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size) else sz += sizeof(__le16); - sz += sizeof(__le16) * strnlen(ses->workstation_name, CIFS_MAX_WORKSTATION_LEN); + if (ses->workstation_name) + sz += sizeof(__le16) * strnlen(ses->workstation_name, + CIFS_MAX_WORKSTATION_LEN); + else + sz += sizeof(__le16); return sz; } From 6a51abdeb259a56d95f13cc67e3a0838bcda0377 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 20 Jan 2022 12:17:37 -0800 Subject: [PATCH 1003/1295] nvme-fabrics: fix state check in nvmf_ctlr_matches_baseopts() Controller deletion/reset, immediately followed by or concurrent with a reconnect, is hard failing the connect attempt resulting in a complete loss of connectivity to the controller. In the connect request, fabrics looks for an existing controller with the same address components and aborts the connect if a controller already exists and the duplicate connect option isn't set. The match routine filters out controllers that are dead or dying, so they don't interfere with the new connect request. When NVME_CTRL_DELETING_NOIO was added, it missed updating the state filters in the nvmf_ctlr_matches_baseopts() routine. Thus, when in this new state, it's seen as a live controller and fails the connect request. Correct by adding the DELETING_NIO state to the match checks. Fixes: ecca390e8056 ("nvme: fix deadlock in disconnect during scan_work and/or ana_work") Cc: # v5.7+ Signed-off-by: Uday Shankar Reviewed-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index c3203ff1c654..1e3a09cad961 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -170,6 +170,7 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts) { if (ctrl->state == NVME_CTRL_DELETING || + ctrl->state == NVME_CTRL_DELETING_NOIO || ctrl->state == NVME_CTRL_DEAD || strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || strcmp(opts->host->nqn, ctrl->opts->host->nqn) || From b293dcc473d22a62dc6d78de2b15e4f49515db56 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 2 Feb 2022 14:01:58 +0800 Subject: [PATCH 1004/1295] bpf: Use VM_MAP instead of VM_ALLOC for ringbuf After commit 2fd3fb0be1d1 ("kasan, vmalloc: unpoison VM_ALLOC pages after mapping"), non-VM_ALLOC mappings will be marked as accessible in __get_vm_area_node() when KASAN is enabled. But now the flag for ringbuf area is VM_ALLOC, so KASAN will complain out-of-bound access after vmap() returns. Because the ringbuf area is created by mapping allocated pages, so use VM_MAP instead. After the change, info in /proc/vmallocinfo also changes from [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmalloc user to [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmap user Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: syzbot+5ad567a418794b9b5983@syzkaller.appspotmail.com Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220202060158.6260-1-houtao1@huawei.com --- kernel/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 638d7fd7b375..710ba9de12ce 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; From 4564661af6ee321942ec1ab012191d7adedd3e00 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Jan 2022 11:17:05 -0800 Subject: [PATCH 1005/1295] xen: xenbus_dev.h: delete incorrect file name It is better/preferred not to include file names in source files because (a) they are not needed and (b) they can be incorrect, so just delete this incorrect file name. Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20220130191705.24971-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- include/xen/xenbus_dev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h index bbee8c6a349d..4dc45a51c044 100644 --- a/include/xen/xenbus_dev.h +++ b/include/xen/xenbus_dev.h @@ -1,6 +1,4 @@ /****************************************************************************** - * evtchn.h - * * Interface to /dev/xen/xenbus_backend. * * Copyright (c) 2011 Bastian Blank From 164666fa66669d437bdcc8d5f1744a2aee73be41 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 31 Jan 2022 12:23:07 -0500 Subject: [PATCH 1006/1295] Improve docs for IOCTL_GNTDEV_MAP_GRANT_REF --------------cKY3Ggs6VDUCSn4I6iN78sHA Content-Type: multipart/mixed; boundary="------------g0T69ASidFiPhh4eOY4XzIg1" --------------g0T69ASidFiPhh4eOY4XzIg1 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable The current implementation of gntdev guarantees that the first call to IOCTL_GNTDEV_MAP_GRANT_REF will set @index to 0. This is required to use gntdev for Wayland, which is a future desire of Qubes OS. Additionally, requesting zero grants results in an error, but this was not documented either. Document both of these. Signed-off-by: Demi Marie Obenour Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/f66c5a4e-2034-00b5-a635-6983bd999c07@gmail.com Signed-off-by: Juergen Gross --- include/uapi/xen/gntdev.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index 9ac5515b9bc2..7a7145395c09 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -47,7 +47,13 @@ struct ioctl_gntdev_grant_ref { /* * Inserts the grant references into the mapping table of an instance * of gntdev. N.B. This does not perform the mapping, which is deferred - * until mmap() is called with @index as the offset. + * until mmap() is called with @index as the offset. @index should be + * considered opaque to userspace, with one exception: if no grant + * references have ever been inserted into the mapping table of this + * instance, @index will be set to 0. This is necessary to use gntdev + * with userspace APIs that expect a file descriptor that can be + * mmap()'d at offset 0, such as Wayland. If @count is set to 0, this + * ioctl will fail. */ #define IOCTL_GNTDEV_MAP_GRANT_REF \ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) From 3ccb3128e50380b86e01c2f9b6f4ac9c11dc05eb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 31 Jan 2022 08:19:59 -0800 Subject: [PATCH 1007/1295] xen: update missing ioctl magic numers documentation Add missing ioctl "magic numbers" for various Xen interfaces (xenbus_dev.h, gntalloc.h, gntdev.h, and privcmd.h). Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20220131161959.16509-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- Documentation/userspace-api/ioctl/ioctl-number.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 687efcf245c1..e6fce2cbd99e 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -115,6 +115,7 @@ Code Seq# Include File Comments 'B' 00-1F linux/cciss_ioctl.h conflict! 'B' 00-0F include/linux/pmu.h conflict! 'B' C0-FF advanced bbus +'B' 00-0F xen/xenbus_dev.h conflict! 'C' all linux/soundcard.h conflict! 'C' 01-2F linux/capi.h conflict! 'C' F0-FF drivers/net/wan/cosa.h conflict! @@ -134,6 +135,7 @@ Code Seq# Include File Comments 'F' 80-8F linux/arcfb.h conflict! 'F' DD video/sstfb.h conflict! 'G' 00-3F drivers/misc/sgi-gru/grulib.h conflict! +'G' 00-0F xen/gntalloc.h, xen/gntdev.h conflict! 'H' 00-7F linux/hiddev.h conflict! 'H' 00-0F linux/hidraw.h conflict! 'H' 01 linux/mei.h conflict! @@ -176,6 +178,7 @@ Code Seq# Include File Comments 'P' 60-6F sound/sscape_ioctl.h conflict! 'P' 00-0F drivers/usb/class/usblp.c conflict! 'P' 01-09 drivers/misc/pci_endpoint_test.c conflict! +'P' 00-0F xen/privcmd.h conflict! 'Q' all linux/soundcard.h 'R' 00-1F linux/random.h conflict! 'R' 01 linux/rfkill.h conflict! From e25a8d959992f61b64a58fc62fb7951dc6f31d1f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 1 Feb 2022 11:57:16 +0100 Subject: [PATCH 1008/1295] x86/Xen: streamline (and fix) PV CPU enumeration This started out with me noticing that "dom0_max_vcpus=" with larger than the number of physical CPUs reported through ACPI tables would not bring up the "excess" vCPU-s. Addressing this is the primary purpose of the change; CPU maps handling is being tidied only as far as is necessary for the change here (with the effect of also avoiding the setting up of too much per-CPU infrastructure, i.e. for CPUs which can never come online). Noticing that xen_fill_possible_map() is called way too early, whereas xen_filter_cpu_maps() is called too late (after per-CPU areas were already set up), and further observing that each of the functions serves only one of Dom0 or DomU, it looked like it was better to simplify this. Use the .get_smp_config hook instead, uniformly for Dom0 and DomU. xen_fill_possible_map() can be dropped altogether, while xen_filter_cpu_maps() is re-purposed but not otherwise changed. Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/2dbd5f0a-9859-ca2d-085e-a02f7166c610@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 4 ---- arch/x86/xen/smp_pv.c | 26 ++++++-------------------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5004feb16783..d47c3d176ae4 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,10 +1341,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_acpi_sleep_register(); - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - xen_boot_params_init_edd(); #ifdef CONFIG_ACPI diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6a8f3b53ab83..4a6019238ee7 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -148,28 +148,12 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init xen_fill_possible_map(void) -{ - int i, rc; - - if (xen_initial_domain()) - return; - - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } - } -} - -static void __init xen_filter_cpu_maps(void) +static void __init _get_smp_config(unsigned int early) { int i, rc; unsigned int subtract = 0; - if (!xen_initial_domain()) + if (early) return; num_processors = 0; @@ -210,7 +194,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void) * sure the old memory can be recycled. */ make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); /* @@ -476,5 +459,8 @@ static const struct smp_ops xen_smp_ops __initconst = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; - xen_fill_possible_map(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = _get_smp_config; } From 622c9a3a7868e1eeca39c55305ca3ebec4742b64 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 2 Feb 2022 09:17:55 +0100 Subject: [PATCH 1009/1295] drm: mxsfb: Fix NULL pointer dereference mxsfb should not ever dereference the NULL pointer which drm_atomic_get_new_bridge_state is allowed to return. Assume a fixed format instead. Fixes: b776b0f00f24 ("drm: mxsfb: Use bus_format from the nearest bridge if present") Signed-off-by: Alexander Stein Signed-off-by: Marek Vasut Link: https://patchwork.freedesktop.org/patch/msgid/20220202081755.145716-3-alexander.stein@ew.tq-group.com --- drivers/gpu/drm/mxsfb/mxsfb_kms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mxsfb/mxsfb_kms.c b/drivers/gpu/drm/mxsfb/mxsfb_kms.c index 0655582ae8ed..4cfb6c001679 100644 --- a/drivers/gpu/drm/mxsfb/mxsfb_kms.c +++ b/drivers/gpu/drm/mxsfb/mxsfb_kms.c @@ -361,7 +361,11 @@ static void mxsfb_crtc_atomic_enable(struct drm_crtc *crtc, bridge_state = drm_atomic_get_new_bridge_state(state, mxsfb->bridge); - bus_format = bridge_state->input_bus_cfg.format; + if (!bridge_state) + bus_format = MEDIA_BUS_FMT_FIXED; + else + bus_format = bridge_state->input_bus_cfg.format; + if (bus_format == MEDIA_BUS_FMT_FIXED) { dev_warn_once(drm->dev, "Bridge does not provide bus format, assuming MEDIA_BUS_FMT_RGB888_1X24.\n" From 1c71dbc8a179d99dd9bb7e7fc1888db613cf85de Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:50 +0000 Subject: [PATCH 1010/1295] KVM: arm64: Avoid consuming a stale esr value when SError occur When any exception other than an IRQ occurs, the CPU updates the ESR_EL2 register with the exception syndrome. An SError may also become pending, and will be synchronised by KVM. KVM notes the exception type, and whether an SError was synchronised in exit_code. When an exception other than an IRQ occurs, fixup_guest_exit() updates vcpu->arch.fault.esr_el2 from the hardware register. When an SError was synchronised, the vcpu esr value is used to determine if the exception was due to an HVC. If so, ELR_EL2 is moved back one instruction. This is so that KVM can process the SError first, and re-execute the HVC if the guest survives the SError. But if an IRQ synchronises an SError, the vcpu's esr value is stale. If the previous non-IRQ exception was an HVC, KVM will corrupt ELR_EL2, causing an unrelated guest instruction to be executed twice. Check ARM_EXCEPTION_CODE() before messing with ELR_EL2, IRQs don't update this register so don't need to check. Fixes: defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP") Cc: stable@vger.kernel.org Reported-by: Steven Price Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-3-james.morse@arm.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 58e14f8ead23..331dd10821df 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -424,7 +424,8 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); - if (ARM_SERROR_PENDING(*exit_code)) { + if (ARM_SERROR_PENDING(*exit_code) && + ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) { u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); /* From 1229630af88620f6e3a621a1ebd1ca14d9340df7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:51 +0000 Subject: [PATCH 1011/1295] KVM: arm64: Stop handle_exit() from handling HVC twice when an SError occurs Prior to commit defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP"), when an SError is synchronised due to another exception, KVM handles the SError first. If the guest survives, the instruction that triggered the original exception is re-exectued to handle the first exception. HVC is treated as a special case as the instruction wouldn't normally be re-exectued, as its not a trap. Commit defe21f49bc9 didn't preserve the behaviour of the 'return 1' that skips the rest of handle_exit(). Since commit defe21f49bc9, KVM will try to handle the SError and the original exception at the same time. When the exception was an HVC, fixup_guest_exit() has already rolled back ELR_EL2, meaning if the guest has virtual SError masked, it will execute and handle the HVC twice. Restore the original behaviour. Fixes: defe21f49bc9 ("KVM: arm64: Move PC rollback on SError to HYP") Cc: stable@vger.kernel.org Signed-off-by: James Morse Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-4-james.morse@arm.com --- arch/arm64/kvm/handle_exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fd2dd26caf91..e3140abd2e2e 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -228,6 +228,14 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; + if (ARM_SERROR_PENDING(exception_index)) { + /* + * The SError is handled by handle_exit_early(). If the guest + * survives it will re-execute the original instruction. + */ + return 1; + } + exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { From 1dd498e5e26ad71e3e9130daf72cfb6a693fee03 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 27 Jan 2022 12:20:52 +0000 Subject: [PATCH 1012/1295] KVM: arm64: Workaround Cortex-A510's single-step and PAC trap errata Cortex-A510's erratum #2077057 causes SPSR_EL2 to be corrupted when single-stepping authenticated ERET instructions. A single step is expected, but a pointer authentication trap is taken instead. The erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow EL1 to cause a return to EL2 with a guest controlled ELR_EL2. Because the conditions require an ERET into active-not-pending state, this is only a problem for the EL2 when EL2 is stepping EL1. In this case the previous SPSR_EL2 value is preserved in struct kvm_vcpu, and can be restored. Cc: stable@vger.kernel.org # 53960faf2b73: arm64: Add Cortex-A510 CPU part definition Cc: stable@vger.kernel.org Signed-off-by: James Morse [maz: fixup cpucaps ordering] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220127122052.1584324-5-james.morse@arm.com --- Documentation/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 16 ++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 8 ++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 20 +++++++++++++++++++- arch/arm64/tools/cpucaps | 5 +++-- 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 0ec7b7f1524b..ea281dd75517 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -100,6 +100,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f2b5a4abef21..cbcd42decb2a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -680,6 +680,22 @@ config ARM64_ERRATUM_2051678 If unsure, say Y. +config ARM64_ERRATUM_2077057 + bool "Cortex-A510: 2077057: workaround software-step corrupting SPSR_EL2" + help + This option adds the workaround for ARM Cortex-A510 erratum 2077057. + Affected Cortex-A510 may corrupt SPSR_EL2 when the a step exception is + expected, but a Pointer Authentication trap is taken instead. The + erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow + EL1 to cause a return to EL2 with a guest controlled ELR_EL2. + + This can only happen when EL2 is stepping EL1. + + When these conditions occur, the SPSR_EL2 value is unchanged from the + previous guest entry, and can be restored from the in-memory copy. + + If unsure, say Y. + config ARM64_ERRATUM_2119858 bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 066098198c24..b217941713a8 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -600,6 +600,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2064142 { .desc = "ARM erratum 2064142", diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 331dd10821df..701cfb964905 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -402,6 +402,24 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } +static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Check for the conditions of Cortex-A510's #2077057. When these occur + * SPSR_EL2 can't be trusted, but isn't needed either as it is + * unchanged from the value in vcpu_gp_regs(vcpu)->pstate. + * Are we single-stepping the guest, and took a PAC exception from the + * active-not-pending state? + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_2077057) && + vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + *vcpu_cpsr(vcpu) & DBG_SPSR_SS && + ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC) + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -413,7 +431,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * Save PSTATE early so that we can evaluate the vcpu mode * early on. */ - vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + synchronize_vcpu_pstate(vcpu, exit_code); /* * Check whether we want to repaint the state one way or diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index e7719e8f18de..9c65b1e25a96 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -55,9 +55,10 @@ WORKAROUND_1418040 WORKAROUND_1463225 WORKAROUND_1508412 WORKAROUND_1542419 -WORKAROUND_2064142 -WORKAROUND_2038923 WORKAROUND_1902691 +WORKAROUND_2038923 +WORKAROUND_2064142 +WORKAROUND_2077057 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE From aceeafefff736057e8f93f19bbfbef26abd94604 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 27 Jan 2022 15:29:39 +0100 Subject: [PATCH 1013/1295] optee: use driver internal tee_context for some rpc Adds a driver private tee_context by moving the tee_context in struct optee_notif to struct optee. This tee_context was previously used when doing internal calls to secure world to deliver notification. The new driver internal tee_context is now also when allocating driver private shared memory. This decouples the shared memory object from its original tee_context. This is needed when the life time of such a memory allocation outlives the client tee_context. This patch fixes the problem described below: The addition of a shutdown hook by commit f25889f93184 ("optee: fix tee out of memory failure seen during kexec reboot") introduced a kernel shutdown regression that can be triggered after running the OP-TEE xtest suites. Once the shutdown hook is called it is not possible to communicate any more with the supplicant process because the system is not scheduling task any longer. Thus if the optee driver shutdown path receives a supplicant RPC request from the OP-TEE we will deadlock the kernel's shutdown. Fixes: f25889f93184 ("optee: fix tee out of memory failure seen during kexec reboot") Fixes: 217e0250cccb ("tee: use reference counting for tee_context") Reported-by: Lars Persson Cc: stable@vger.kernel.org Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/core.c | 1 + drivers/tee/optee/ffa_abi.c | 77 +++++++++++++++++-------------- drivers/tee/optee/optee_private.h | 5 +- drivers/tee/optee/smc_abi.c | 48 +++++++------------ 4 files changed, 64 insertions(+), 67 deletions(-) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 1ca320885fad..17a6f51d3089 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -158,6 +158,7 @@ void optee_remove_common(struct optee *optee) optee_unregister_devices(); optee_notif_uninit(optee); + teedev_close_context(optee->ctx); /* * The two devices have to be unregistered before we can free the * other resources. diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c index 20a1b1a3d965..545f61af1248 100644 --- a/drivers/tee/optee/ffa_abi.c +++ b/drivers/tee/optee/ffa_abi.c @@ -424,6 +424,7 @@ static struct tee_shm_pool_mgr *optee_ffa_shm_pool_alloc_pages(void) */ static void handle_ffa_rpc_func_cmd_shm_alloc(struct tee_context *ctx, + struct optee *optee, struct optee_msg_arg *arg) { struct tee_shm *shm; @@ -439,7 +440,7 @@ static void handle_ffa_rpc_func_cmd_shm_alloc(struct tee_context *ctx, shm = optee_rpc_cmd_alloc_suppl(ctx, arg->params[0].u.value.b); break; case OPTEE_RPC_SHM_TYPE_KERNEL: - shm = tee_shm_alloc(ctx, arg->params[0].u.value.b, + shm = tee_shm_alloc(optee->ctx, arg->params[0].u.value.b, TEE_SHM_MAPPED | TEE_SHM_PRIV); break; default: @@ -493,14 +494,13 @@ err_bad_param: } static void handle_ffa_rpc_func_cmd(struct tee_context *ctx, + struct optee *optee, struct optee_msg_arg *arg) { - struct optee *optee = tee_get_drvdata(ctx->teedev); - arg->ret_origin = TEEC_ORIGIN_COMMS; switch (arg->cmd) { case OPTEE_RPC_CMD_SHM_ALLOC: - handle_ffa_rpc_func_cmd_shm_alloc(ctx, arg); + handle_ffa_rpc_func_cmd_shm_alloc(ctx, optee, arg); break; case OPTEE_RPC_CMD_SHM_FREE: handle_ffa_rpc_func_cmd_shm_free(ctx, optee, arg); @@ -510,12 +510,12 @@ static void handle_ffa_rpc_func_cmd(struct tee_context *ctx, } } -static void optee_handle_ffa_rpc(struct tee_context *ctx, u32 cmd, - struct optee_msg_arg *arg) +static void optee_handle_ffa_rpc(struct tee_context *ctx, struct optee *optee, + u32 cmd, struct optee_msg_arg *arg) { switch (cmd) { case OPTEE_FFA_YIELDING_CALL_RETURN_RPC_CMD: - handle_ffa_rpc_func_cmd(ctx, arg); + handle_ffa_rpc_func_cmd(ctx, optee, arg); break; case OPTEE_FFA_YIELDING_CALL_RETURN_INTERRUPT: /* Interrupt delivered by now */ @@ -582,7 +582,7 @@ static int optee_ffa_yielding_call(struct tee_context *ctx, * above. */ cond_resched(); - optee_handle_ffa_rpc(ctx, data->data1, rpc_arg); + optee_handle_ffa_rpc(ctx, optee, data->data1, rpc_arg); cmd = OPTEE_FFA_YIELDING_CALL_RESUME; data->data0 = cmd; data->data1 = 0; @@ -793,7 +793,9 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) { const struct ffa_dev_ops *ffa_ops; unsigned int rpc_arg_count; + struct tee_shm_pool *pool; struct tee_device *teedev; + struct tee_context *ctx; struct optee *optee; int rc; @@ -813,12 +815,12 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) if (!optee) return -ENOMEM; - optee->pool = optee_ffa_config_dyn_shm(); - if (IS_ERR(optee->pool)) { - rc = PTR_ERR(optee->pool); - optee->pool = NULL; - goto err; + pool = optee_ffa_config_dyn_shm(); + if (IS_ERR(pool)) { + rc = PTR_ERR(pool); + goto err_free_optee; } + optee->pool = pool; optee->ops = &optee_ffa_ops; optee->ffa.ffa_dev = ffa_dev; @@ -829,7 +831,7 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) optee); if (IS_ERR(teedev)) { rc = PTR_ERR(teedev); - goto err; + goto err_free_pool; } optee->teedev = teedev; @@ -837,50 +839,57 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) optee); if (IS_ERR(teedev)) { rc = PTR_ERR(teedev); - goto err; + goto err_unreg_teedev; } optee->supp_teedev = teedev; rc = tee_device_register(optee->teedev); if (rc) - goto err; + goto err_unreg_supp_teedev; rc = tee_device_register(optee->supp_teedev); if (rc) - goto err; + goto err_unreg_supp_teedev; rc = rhashtable_init(&optee->ffa.global_ids, &shm_rhash_params); if (rc) - goto err; + goto err_unreg_supp_teedev; mutex_init(&optee->ffa.mutex); mutex_init(&optee->call_queue.mutex); INIT_LIST_HEAD(&optee->call_queue.waiters); optee_supp_init(&optee->supp); ffa_dev_set_drvdata(ffa_dev, optee); + ctx = teedev_open(optee->teedev); + if (IS_ERR(ctx)) + goto err_rhashtable_free; + optee->ctx = ctx; rc = optee_notif_init(optee, OPTEE_DEFAULT_MAX_NOTIF_VALUE); - if (rc) { - optee_ffa_remove(ffa_dev); - return rc; - } + if (rc) + goto err_close_ctx; rc = optee_enumerate_devices(PTA_CMD_GET_DEVICES); - if (rc) { - optee_ffa_remove(ffa_dev); - return rc; - } + if (rc) + goto err_unregister_devices; pr_info("initialized driver\n"); return 0; -err: - /* - * tee_device_unregister() is safe to call even if the - * devices hasn't been registered with - * tee_device_register() yet. - */ + +err_unregister_devices: + optee_unregister_devices(); + optee_notif_uninit(optee); +err_close_ctx: + teedev_close_context(ctx); +err_rhashtable_free: + rhashtable_free_and_destroy(&optee->ffa.global_ids, rh_free_fn, NULL); + optee_supp_uninit(&optee->supp); + mutex_destroy(&optee->call_queue.mutex); +err_unreg_supp_teedev: tee_device_unregister(optee->supp_teedev); +err_unreg_teedev: tee_device_unregister(optee->teedev); - if (optee->pool) - tee_shm_pool_free(optee->pool); +err_free_pool: + tee_shm_pool_free(pool); +err_free_optee: kfree(optee); return rc; } diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h index 46f74ab07c7e..92bc47bef95f 100644 --- a/drivers/tee/optee/optee_private.h +++ b/drivers/tee/optee/optee_private.h @@ -53,7 +53,6 @@ struct optee_call_queue { struct optee_notif { u_int max_key; - struct tee_context *ctx; /* Serializes access to the elements below in this struct */ spinlock_t lock; struct list_head db; @@ -134,9 +133,10 @@ struct optee_ops { /** * struct optee - main service struct * @supp_teedev: supplicant device + * @teedev: client device * @ops: internal callbacks for different ways to reach secure * world - * @teedev: client device + * @ctx: driver internal TEE context * @smc: specific to SMC ABI * @ffa: specific to FF-A ABI * @call_queue: queue of threads waiting to call @invoke_fn @@ -152,6 +152,7 @@ struct optee { struct tee_device *supp_teedev; struct tee_device *teedev; const struct optee_ops *ops; + struct tee_context *ctx; union { struct optee_smc smc; struct optee_ffa ffa; diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index 449d6a72d289..bacd1a1d79ee 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -622,6 +622,7 @@ static void handle_rpc_func_cmd_shm_free(struct tee_context *ctx, } static void handle_rpc_func_cmd_shm_alloc(struct tee_context *ctx, + struct optee *optee, struct optee_msg_arg *arg, struct optee_call_ctx *call_ctx) { @@ -651,7 +652,8 @@ static void handle_rpc_func_cmd_shm_alloc(struct tee_context *ctx, shm = optee_rpc_cmd_alloc_suppl(ctx, sz); break; case OPTEE_RPC_SHM_TYPE_KERNEL: - shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED | TEE_SHM_PRIV); + shm = tee_shm_alloc(optee->ctx, sz, + TEE_SHM_MAPPED | TEE_SHM_PRIV); break; default: arg->ret = TEEC_ERROR_BAD_PARAMETERS; @@ -747,7 +749,7 @@ static void handle_rpc_func_cmd(struct tee_context *ctx, struct optee *optee, switch (arg->cmd) { case OPTEE_RPC_CMD_SHM_ALLOC: free_pages_list(call_ctx); - handle_rpc_func_cmd_shm_alloc(ctx, arg, call_ctx); + handle_rpc_func_cmd_shm_alloc(ctx, optee, arg, call_ctx); break; case OPTEE_RPC_CMD_SHM_FREE: handle_rpc_func_cmd_shm_free(ctx, arg); @@ -776,7 +778,7 @@ static void optee_handle_rpc(struct tee_context *ctx, switch (OPTEE_SMC_RETURN_GET_RPC_FUNC(param->a0)) { case OPTEE_SMC_RPC_FUNC_ALLOC: - shm = tee_shm_alloc(ctx, param->a1, + shm = tee_shm_alloc(optee->ctx, param->a1, TEE_SHM_MAPPED | TEE_SHM_PRIV); if (!IS_ERR(shm) && !tee_shm_get_pa(shm, 0, &pa)) { reg_pair_from_64(¶m->a1, ¶m->a2, pa); @@ -954,57 +956,34 @@ static irqreturn_t notif_irq_thread_fn(int irq, void *dev_id) { struct optee *optee = dev_id; - optee_smc_do_bottom_half(optee->notif.ctx); + optee_smc_do_bottom_half(optee->ctx); return IRQ_HANDLED; } static int optee_smc_notif_init_irq(struct optee *optee, u_int irq) { - struct tee_context *ctx; int rc; - ctx = teedev_open(optee->teedev); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - optee->notif.ctx = ctx; rc = request_threaded_irq(irq, notif_irq_handler, notif_irq_thread_fn, 0, "optee_notification", optee); if (rc) - goto err_close_ctx; + return rc; optee->smc.notif_irq = irq; return 0; - -err_close_ctx: - teedev_close_context(optee->notif.ctx); - optee->notif.ctx = NULL; - - return rc; } static void optee_smc_notif_uninit_irq(struct optee *optee) { - if (optee->notif.ctx) { - optee_smc_stop_async_notif(optee->notif.ctx); + if (optee->smc.sec_caps & OPTEE_SMC_SEC_CAP_ASYNC_NOTIF) { + optee_smc_stop_async_notif(optee->ctx); if (optee->smc.notif_irq) { free_irq(optee->smc.notif_irq, optee); irq_dispose_mapping(optee->smc.notif_irq); } - - /* - * The thread normally working with optee->notif.ctx was - * stopped with free_irq() above. - * - * Note we're not using teedev_close_context() or - * tee_client_close_context() since we have already called - * tee_device_put() while initializing to avoid a circular - * reference counting. - */ - teedev_close_context(optee->notif.ctx); } } @@ -1366,6 +1345,7 @@ static int optee_probe(struct platform_device *pdev) struct optee *optee = NULL; void *memremaped_shm = NULL; struct tee_device *teedev; + struct tee_context *ctx; u32 max_notif_value; u32 sec_caps; int rc; @@ -1446,9 +1426,13 @@ static int optee_probe(struct platform_device *pdev) optee->pool = pool; platform_set_drvdata(pdev, optee); + ctx = teedev_open(optee->teedev); + if (IS_ERR(ctx)) + goto err_supp_uninit; + optee->ctx = ctx; rc = optee_notif_init(optee, max_notif_value); if (rc) - goto err_supp_uninit; + goto err_close_ctx; if (sec_caps & OPTEE_SMC_SEC_CAP_ASYNC_NOTIF) { unsigned int irq; @@ -1496,6 +1480,8 @@ err_disable_shm_cache: optee_unregister_devices(); err_notif_uninit: optee_notif_uninit(optee); +err_close_ctx: + teedev_close_context(ctx); err_supp_uninit: optee_supp_uninit(&optee->supp); mutex_destroy(&optee->call_queue.mutex); From 68e8cc2a23b61862bf20ffc9b782a3fb49c65568 Mon Sep 17 00:00:00 2001 From: Yizhuo Zhai Date: Wed, 2 Feb 2022 15:58:08 -0800 Subject: [PATCH 1014/1295] fbdev: fbmem: Fix the implicit type casting In function do_fb_ioctl(), the "arg" is the type of unsigned long, and in "case FBIOBLANK:" this argument is casted into an int before passig to fb_blank(). In fb_blank(), the comparision if (blank > FB_BLANK_POWERDOWN) would be bypass if the original "arg" is a large number, which is possible because it comes from the user input. Fix this by adding the check before the function call. Reviewed-by: Guenter Roeck Reviewed-by: Sam Ravnborg Acked-by: Helge Deller Signed-off-by: Yizhuo Zhai Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202235811.1621017-1-yzhai003@ucr.edu --- drivers/video/fbdev/core/fbmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 0fa7ede94fa6..13083ad8d751 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1160,6 +1160,8 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, ret = fbcon_set_con2fb_map_ioctl(argp); break; case FBIOBLANK: + if (arg > FB_BLANK_POWERDOWN) + return -EINVAL; console_lock(); lock_fb_info(info); ret = fb_blank(info, arg); From 3149efcdf2c6314420c418dfc94de53bfd076b1f Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 26 Jan 2022 17:43:34 -0800 Subject: [PATCH 1015/1295] PCI: hv: Fix NUMA node assignment when kernel boots with custom NUMA topology When kernel boots with a NUMA topology with some NUMA nodes offline, the PCI driver should only set an online NUMA node on the device. This can happen during KDUMP where some NUMA nodes are not made online by the KDUMP kernel. This patch also fixes the case where kernel is booting with "numa=off". Fixes: 999dd956d838 ("PCI: hv: Add support for protocol 1.3 and support PCI_BUS_RELATIONS2") Signed-off-by: Long Li Reviewed-by: Michael Kelley Tested-by: Purna Pavan Chandra Aekkaladevi Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/1643247814-15184-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 20ea2ee330b8..ae0bc2fee4ca 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2155,8 +2155,17 @@ static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) if (!hv_dev) continue; - if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY) - set_dev_node(&dev->dev, hv_dev->desc.virtual_numa_node); + if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && + hv_dev->desc.virtual_numa_node < num_possible_nodes()) + /* + * The kernel may boot with some NUMA nodes offline + * (e.g. in a KDUMP kernel) or with NUMA disabled via + * "numa=off". In those cases, adjust the host provided + * NUMA node to a valid NUMA node used by the kernel. + */ + set_dev_node(&dev->dev, + numa_map_to_online_node( + hv_dev->desc.virtual_numa_node)); put_pcichild(hv_dev); } From c36c04c2e132fc39f6b658bf607aed4425427fd7 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 1 Feb 2022 19:23:17 -0800 Subject: [PATCH 1016/1295] Revert "mm/gup: small refactoring: simplify try_grab_page()" This reverts commit 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a That commit did a refactoring that effectively combined fast and slow gup paths (again). And that was again incorrect, for two reasons: a) Fast gup and slow gup get reference counts on pages in different ways and with different goals: see Linus' writeup in commit cd1adf1b63a1 ("Revert "mm/gup: remove try_get_page(), call try_get_compound_head() directly""), and b) try_grab_compound_head() also has a specific check for "FOLL_LONGTERM && !is_pinned(page)", that assumes that the caller can fall back to slow gup. This resulted in new failures, as recently report by Will McVicker [1]. But (a) has problems too, even though they may not have been reported yet. So just revert this. Link: https://lore.kernel.org/r/20220131203504.3458775-1-willmcvicker@google.com [1] Fixes: 54d516b1d62f ("mm/gup: small refactoring: simplify try_grab_page()") Reported-and-tested-by: Will McVicker Cc: Christoph Hellwig Cc: Minchan Kim Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: stable@vger.kernel.org # 5.15 Signed-off-by: John Hubbard Signed-off-by: Linus Torvalds --- mm/gup.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f0af462ac1e2..a9d4d724aef7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +__maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); @@ -208,10 +208,35 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - if (!(flags & (FOLL_GET | FOLL_PIN))) - return true; + WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); - return try_grab_compound_head(page, 1, flags); + if (flags & FOLL_GET) + return try_get_page(page); + else if (flags & FOLL_PIN) { + int refs = 1; + + page = compound_head(page); + + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + + if (hpage_pincount_available(page)) + hpage_pincount_add(page, 1); + else + refs = GUP_PIN_COUNTING_BIAS; + + /* + * Similar to try_grab_compound_head(): even if using the + * hpage_pincount_add/_sub() routines, be sure to + * *also* increment the normal page refcount field at least + * once, so that the page really is pinned. + */ + page_ref_add(page, refs); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); + } + + return true; } /** From 3404b39919c3c87677cdba45405d24033b3276f3 Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Thu, 27 Jan 2022 14:17:54 +0100 Subject: [PATCH 1017/1295] drm/vc4: hdmi: Ensure we don't use 2711 HPD registers on Pi0-3 The existing logic was flawed in that it could try reading the 2711 specific registers for HPD on a CM1/3 where the HPD GPIO hadn't been defined in DT. Ensure we don't do the 2711 register read on invalid hardware, and then Signed-off-by: Dave Stevenson Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220127131754.236074-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 24 ++++++++++++++++-------- drivers/gpu/drm/vc4/vc4_hdmi.h | 3 +++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 053fbaf765ca..3286decf5be5 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -196,14 +196,8 @@ vc4_hdmi_connector_detect(struct drm_connector *connector, bool force) if (gpiod_get_value_cansleep(vc4_hdmi->hpd_gpio)) connected = true; } else { - unsigned long flags; - u32 hotplug; - - spin_lock_irqsave(&vc4_hdmi->hw_lock, flags); - hotplug = HDMI_READ(HDMI_HOTPLUG); - spin_unlock_irqrestore(&vc4_hdmi->hw_lock, flags); - - if (hotplug & VC4_HDMI_HOTPLUG_CONNECTED) + if (vc4_hdmi->variant->hp_detect && + vc4_hdmi->variant->hp_detect(vc4_hdmi)) connected = true; } @@ -1343,6 +1337,18 @@ static u32 vc5_hdmi_channel_map(struct vc4_hdmi *vc4_hdmi, u32 channel_mask) return channel_map; } +static bool vc5_hdmi_hp_detect(struct vc4_hdmi *vc4_hdmi) +{ + unsigned long flags; + u32 hotplug; + + spin_lock_irqsave(&vc4_hdmi->hw_lock, flags); + hotplug = HDMI_READ(HDMI_HOTPLUG); + spin_unlock_irqrestore(&vc4_hdmi->hw_lock, flags); + + return !!(hotplug & VC4_HDMI_HOTPLUG_CONNECTED); +} + /* HDMI audio codec callbacks */ static void vc4_hdmi_audio_set_mai_clock(struct vc4_hdmi *vc4_hdmi, unsigned int samplerate) @@ -2723,6 +2729,7 @@ static const struct vc4_hdmi_variant bcm2711_hdmi0_variant = { .phy_rng_disable = vc5_hdmi_phy_rng_disable, .channel_map = vc5_hdmi_channel_map, .supports_hdr = true, + .hp_detect = vc5_hdmi_hp_detect, }; static const struct vc4_hdmi_variant bcm2711_hdmi1_variant = { @@ -2751,6 +2758,7 @@ static const struct vc4_hdmi_variant bcm2711_hdmi1_variant = { .phy_rng_disable = vc5_hdmi_phy_rng_disable, .channel_map = vc5_hdmi_channel_map, .supports_hdr = true, + .hp_detect = vc5_hdmi_hp_detect, }; static const struct of_device_id vc4_hdmi_dt_match[] = { diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.h b/drivers/gpu/drm/vc4/vc4_hdmi.h index 36c0b082a43b..31b77a94c526 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.h +++ b/drivers/gpu/drm/vc4/vc4_hdmi.h @@ -102,6 +102,9 @@ struct vc4_hdmi_variant { /* Enables HDR metadata */ bool supports_hdr; + + /* Callback for hardware specific hotplug detect */ + bool (*hp_detect)(struct vc4_hdmi *vc4_hdmi); }; /* HDMI audio information */ From 71702c495b78dfbc22eeac32ea9cda452862750d Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Thu, 27 Jan 2022 14:45:59 +0100 Subject: [PATCH 1018/1295] drm/vc4: hdmi: Don't try disabling SCDC on Pi0-3. The code that set the scdc_enabled flag to ensure it was disabled at boot time also ran on Pi0-3 where there is no SCDC support. This lead to a warning in vc4_hdmi_encoder_post_crtc_disable due to vc4_hdmi_disable_scrambling being called and trying to read (and write) register HDMI_SCRAMBLER_CTL which doesn't exist on those platforms. Only set the flag should the interface be configured to support more than HDMI 1.4. Fixes: 1998646129fa ("drm/vc4: hdmi: Introduce a scdc_enabled flag") Signed-off-by: Dave Stevenson Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220127134559.292778-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 3286decf5be5..6d5a6388ca1e 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -2510,7 +2510,8 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data) * vc4_hdmi_disable_scrambling() will thus run at boot, make * sure it's disabled, and avoid any inconsistency. */ - vc4_hdmi->scdc_enabled = true; + if (variant->max_pixel_clock > HDMI_14_MAX_TMDS_CLK) + vc4_hdmi->scdc_enabled = true; ret = variant->init_resources(vc4_hdmi); if (ret) From 1d118965965f89948236ebe23072bb1fca5e7832 Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Thu, 27 Jan 2022 14:51:16 +0100 Subject: [PATCH 1019/1295] drm/vc4: hdmi: Allow DBLCLK modes even if horz timing is odd. The 2711 pixel valve can't produce odd horizontal timings, and checks were added to vc4_hdmi_encoder_atomic_check and vc4_hdmi_encoder_mode_valid to filter out/block selection of such modes. Modes with DRM_MODE_FLAG_DBLCLK double all the horizontal timing values before programming them into the PV. The PV values, therefore, can not be odd, and so the modes can be supported. Amend the filtering appropriately. Fixes: 57fb32e632be ("drm/vc4: hdmi: Block odd horizontal timings") Signed-off-by: Dave Stevenson Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220127135116.298278-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 6d5a6388ca1e..b30500405fa7 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -1245,6 +1245,7 @@ static int vc4_hdmi_encoder_atomic_check(struct drm_encoder *encoder, unsigned long long tmds_rate; if (vc4_hdmi->variant->unsupported_odd_h_timings && + !(mode->flags & DRM_MODE_FLAG_DBLCLK) && ((mode->hdisplay % 2) || (mode->hsync_start % 2) || (mode->hsync_end % 2) || (mode->htotal % 2))) return -EINVAL; @@ -1292,6 +1293,7 @@ vc4_hdmi_encoder_mode_valid(struct drm_encoder *encoder, struct vc4_hdmi *vc4_hdmi = encoder_to_vc4_hdmi(encoder); if (vc4_hdmi->variant->unsupported_odd_h_timings && + !(mode->flags & DRM_MODE_FLAG_DBLCLK) && ((mode->hdisplay % 2) || (mode->hsync_start % 2) || (mode->hsync_end % 2) || (mode->htotal % 2))) return MODE_H_ILLEGAL; From 7f3bdbc3f13146eb9d07de81ea71f551587a384b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 1 Feb 2022 14:25:04 -0700 Subject: [PATCH 1020/1295] tools/resolve_btfids: Do not print any commands when building silently When building with 'make -s', there is some output from resolve_btfids: $ make -sj"$(nproc)" oldconfig prepare MKDIR .../tools/bpf/resolve_btfids/libbpf/ MKDIR .../tools/bpf/resolve_btfids//libsubcmd LINK resolve_btfids Silent mode means that no information should be emitted about what is currently being done. Use the $(silent) variable from Makefile.include to avoid defining the msg macro so that there is no information printed. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Nathan Chancellor Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220201212503.731732-1-nathan@kernel.org --- tools/bpf/resolve_btfids/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 9ddeca947635..320a88ac28c9 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -9,7 +9,11 @@ ifeq ($(V),1) msg = else Q = @ - msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; + ifeq ($(silent),1) + msg = + else + msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; + endif MAKEFLAGS=--no-print-directory endif From 46963e2e0629cb31c96b1d47ddd89dc3d8990b34 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 27 Jan 2022 14:02:18 +0100 Subject: [PATCH 1021/1295] misc: fastrpc: avoid double fput() on failed usercopy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the copy back to userland fails for the FASTRPC_IOCTL_ALLOC_DMA_BUFF ioctl(), we shouldn't assume that 'buf->dmabuf' is still valid. In fact, dma_buf_fd() called fd_install() before, i.e. "consumed" one reference, leaving us with none. Calling dma_buf_put() will therefore put a reference we no longer own, leading to a valid file descritor table entry for an already released 'file' object which is a straight use-after-free. Simply avoid calling dma_buf_put() and rely on the process exit code to do the necessary cleanup, if needed, i.e. if the file descriptor is still valid. Fixes: 6cffd79504ce ("misc: fastrpc: Add support for dmabuf exporter") Acked-by: Christian König Signed-off-by: Mathias Krause Link: https://lore.kernel.org/r/20220127130218.809261-1-minipli@grsecurity.net Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 4ccbf43e6bfa..aa1682b94a23 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -1288,7 +1288,14 @@ static int fastrpc_dmabuf_alloc(struct fastrpc_user *fl, char __user *argp) } if (copy_to_user(argp, &bp, sizeof(bp))) { - dma_buf_put(buf->dmabuf); + /* + * The usercopy failed, but we can't do much about it, as + * dma_buf_fd() already called fd_install() and made the + * file descriptor accessible for the current process. It + * might already be closed and dmabuf no longer valid when + * we reach this point. Therefore "leak" the fd and rely on + * the process exit path to do any required cleanup. + */ return -EFAULT; } From 599ea31d13617c5484c40cdf50d88301dc351cfc Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 10 Jan 2022 11:51:40 +0800 Subject: [PATCH 1022/1295] ext4: prevent used blocks from being allocated during fast commit replay During fast commit replay procedure, we clear inode blocks bitmap in ext4_ext_clear_bb(), this may cause ext4_mb_new_blocks_simple() allocate blocks still in use. Make ext4_fc_record_regions() also record physical disk regions used by inodes during replay procedure. Then ext4_mb_new_blocks_simple() can excludes these blocks in use. Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220110035141.1980-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 +++ fs/ext4/extents.c | 4 ++++ fs/ext4/fast_commit.c | 20 +++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 715ee206dfe1..598ecf07652a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2934,6 +2934,9 @@ void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1077ce7e189f..5dd13108d4b7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -6093,11 +6093,15 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + 0, path[j].p_block, 1, 1); } ext4_ext_drop_refs(path); kfree(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5ae8026a0c56..1abe78b8d84f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1563,16 +1563,23 @@ out: } /* - * Record physical disk regions which are in use as per fast commit area. Our - * simple replay phase allocator excludes these regions from allocation. + * Record physical disk regions which are in use as per fast commit area, + * and used by inodes during replay phase. Our simple replay phase + * allocator excludes these regions from allocation. */ -static int ext4_fc_record_regions(struct super_block *sb, int ino, - ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; + /* + * during replay phase, the fc_regions_valid may not same as + * fc_regions_used, update it when do new additions. + */ + if (replay && state->fc_regions_used != state->fc_regions_valid) + state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; @@ -1590,6 +1597,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino, region->pblk = pblk; region->len = len; + if (replay) + state->fc_regions_valid++; + return 0; } @@ -1937,7 +1947,7 @@ static int ext4_fc_replay_scan(journal_t *journal, ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_get_actual_len(ex)); + ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; From 31a074a0c62dc0d2bfb9b543142db4fe27f9e5eb Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 10 Jan 2022 11:51:41 +0800 Subject: [PATCH 1023/1295] ext4: modify the logic of ext4_mb_new_blocks_simple For now in ext4_mb_new_blocks_simple, if we found a block which should be excluded then will switch to next group, this may probably cause 'group' run out of range. Change to check next block in the same group when get a block should be excluded. Also change the search range to EXT4_CLUSTERS_PER_GROUP and add error checking. Signed-off-by: Xin Yin Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20220110035141.1980-3-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf2fd9fc7d98..c781974df9d0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5753,7 +5753,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i = sb->s_blocksize; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -5775,19 +5776,26 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, ext4_get_group_no_and_offset(sb, max(ext4_group_first_block_no(sb, group), goal), NULL, &blkoff); - i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize, + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + i)) { + blkoff = i + 1; + } else + break; + } brelse(bitmap_bh); - if (i >= sb->s_blocksize) - continue; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) - continue; - break; + if (i < max) + break; } - if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize) + if (group >= ext4_get_groups_count(sb) || i >= max) { + *errp = -ENOSPC; return 0; + } block = ext4_group_first_block_no(sb, group) + i; ext4_mb_mark_bb(sb, block, 1, 1); From e85c81ba8859a4c839bcd69c5d83b32954133a5b Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: [PATCH 1024/1295] ext4: fast commit may not fallback for ineligible commit For the follow scenario: 1. jbd start commit transaction n 2. task A get new handle for transaction n+1 3. task A do some ineligible actions and mark FC_INELIGIBLE 4. jbd complete transaction n and clean FC_INELIGIBLE 5. task A call fsync In this case fast commit will not fallback to full commit and transaction n+1 also not handled by jbd. Make ext4_fc_mark_ineligible() also record transaction tid for latest ineligible case, when call ext4_fc_cleanup() check current transaction tid, if small than latest ineligible tid do not clear the EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++-------- fs/ext4/inode.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 1 + fs/ext4/xattr.c | 6 +++--- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 2 +- include/linux/jbd2.h | 2 +- 11 files changed, 42 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 598ecf07652a..d52295becda3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1749,6 +1749,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2925,7 +2926,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5dd13108d4b7..3ce4fc250b0d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5336,7 +5336,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5476,7 +5476,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1abe78b8d84f..e031afee42de 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -300,18 +300,32 @@ restart: } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -387,7 +401,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -400,7 +414,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -502,7 +516,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -1179,7 +1193,7 @@ fallback: * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1227,7 +1241,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9dbeb772de60..f368dd5fd8d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -337,7 +337,7 @@ stop_handle: return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -5976,7 +5976,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bbbedf27b71c..a8022c2c6a58 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -411,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -1373,7 +1373,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122..47b9f87dbc6f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a936ecbaa3b..6930b7737ce4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5084,6 +5084,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845b..042325349098 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ retry: if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..d188fa913a07 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1170,7 +1170,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b66..a8e64ad11ae3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -771,7 +771,7 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..d63b8106796e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: From bdc8a53a6f2f0b1cb5f991440f2100732299eb93 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:55 +0800 Subject: [PATCH 1025/1295] ext4: fast commit may miss file actions in the follow scenario: 1. jbd start transaction n 2. task A get new handle for transaction n+1 3. task A do some actions and add inode to FC_Q_MAIN fc_q 4. jbd complete transaction n and clear FC_Q_MAIN fc_q 5. task A call fsync Fast commit will lost the file actions during a full commit. we should also add updates to staging queue during a full commit. and in ext4_fc_cleanup(), when reset a inode's fc track range, check it's i_sync_tid, if it bigger than current transaction tid, do not rest it, or we will lost the track range. And EXT4_MF_FC_COMMITTING is not needed anymore, so drop it. Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-3-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 5 +---- fs/ext4/fast_commit.c | 11 ++++++----- fs/ext4/super.c | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d52295becda3..18cd5b3b4815 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1795,10 +1795,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ - EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast - * commit. - */ + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e031afee42de..ccd2b216d6ba 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -375,7 +375,8 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -428,7 +429,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -893,7 +895,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { @@ -1211,7 +1212,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); - ext4_fc_reset_inode(&iter->vfs_inode); + if (iter->i_sync_tid <= tid) + ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) @@ -1240,7 +1242,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6930b7737ce4..57914acc5402 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5083,7 +5083,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); From 897026aaa73eb2517dfea8d147f20ddb0b813044 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:47 +0530 Subject: [PATCH 1026/1295] ext4: fix error handling in ext4_restore_inline_data() While running "./check -I 200 generic/475" it sometimes gives below kernel BUG(). Ideally we should not call ext4_write_inline_data() if ext4_create_inline_data() has failed. [73131.453234] kernel BUG at fs/ext4/inline.c:223! 212 static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, 213 void *buffer, loff_t pos, unsigned int len) 214 { <...> 223 BUG_ON(!EXT4_I(inode)->i_inline_off); 224 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); This patch handles the error and prints out a emergency msg saying potential data loss for the given inode (since we couldn't restore the original inline_data due to some previous error). [ 9571.070313] EXT4-fs (dm-0): error restoring inline_data for inode -- potential data loss! (inode 1703982, error -30) Reported-by: Eric Whitney Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/9f4cd7dfd54fa58ff27270881823d94ddf78dd07.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 39a1ab129fdc..d091133a4b46 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1133,7 +1133,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } From 09355d9d038a1590ee055831a4ad3a79952cfa8b Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:48 +0530 Subject: [PATCH 1027/1295] ext4: remove redundant max inline_size check in ext4_da_write_inline_data_begin() ext4_prepare_inline_data() already checks for ext4_get_max_inline_size() and returns -ENOSPC. So there is no need to check it twice within ext4_da_write_inline_data_begin(). This patch removes the extra check. It also makes it more clean. No functionality change in this patch. Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/cdd1654128d5105550c65fd13ca5da53b2162cc4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d091133a4b46..cbdd84e49f2c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -911,7 +911,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct page **pagep, void **fsdata) { - int ret, inline_size; + int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; @@ -928,14 +928,9 @@ retry_journal: goto out; } - inline_size = ext4_get_max_inline_size(inode); - - ret = -ENOSPC; - if (inline_size >= pos + len) { - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - } + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out_journal; /* * We cannot recurse into the filesystem as the transaction From cdce59a1549190b66f8e3fe465c2b2f714b98a94 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:49 +0530 Subject: [PATCH 1028/1295] ext4: fix error handling in ext4_fc_record_modified_inode() Current code does not fully takes care of krealloc() error case, which could lead to silent memory corruption or a kernel bug. This patch fixes that. Also it cleans up some duplicated error handling logic from various functions in fast_commit.c file. Reported-by: luo penghao Suggested-by: Lukas Czerner Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/62e8b6a1cce9359682051deb736a3c0953c9d1e9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 64 ++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index ccd2b216d6ba..5934c23e153e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1410,14 +1410,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes_size += - EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, sizeof(int) * - state->fc_modified_inodes_size, - GFP_KERNEL); + state->fc_modified_inodes, + sizeof(int) * (state->fc_modified_inodes_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); if (!state->fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; @@ -1449,7 +1450,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, } inode = NULL; - ext4_fc_record_modified_inode(sb, ino); + ret = ext4_fc_record_modified_inode(sb, ino); + if (ret) + goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); @@ -1649,6 +1652,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); @@ -1666,18 +1671,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb, map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path)) { - iput(inode); - return 0; - } + if (IS_ERR(path)) + goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -1691,10 +1692,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, up_write((&EXT4_I(inode)->i_data_sem)); ext4_ext_drop_refs(path); kfree(path); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; goto next; } @@ -1707,10 +1706,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified @@ -1730,10 +1727,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. @@ -1745,6 +1740,7 @@ next: } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); +out: iput(inode); return 0; } @@ -1774,6 +1770,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), @@ -1783,10 +1781,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret > 0) { remaining -= ret; cur += ret; @@ -1801,15 +1797,13 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, ret = ext4_ext_remove_space(inode, lrange.fc_lblk, lrange.fc_lblk + lrange.fc_len - 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); +out: iput(inode); - return 0; } From 3ca40c0d329113a9f76f6aa01abe73d9f16ace9d Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:50 +0530 Subject: [PATCH 1029/1295] jbd2: cleanup unused functions declarations from jbd2.h During code review found no references of few of these below function declarations. This patch cleans those up from jbd2.h Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/30d1fc327becda197a4136cf9cdc73d9baa3b7b9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d63b8106796e..afc5572e7b8a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1419,9 +1419,7 @@ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_free_buffer(struct journal_head *bh); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_clean_data_list(transaction_t *transaction); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); @@ -1486,9 +1484,6 @@ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr); -/* Transaction locking */ -extern void __wait_on_journal (journal_t *); - /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); @@ -1774,8 +1769,6 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -extern int jbd_blocks_per_page(struct inode *inode); - /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 From 4f98186848707f530669238d90e0562d92a78aab Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:51 +0530 Subject: [PATCH 1030/1295] jbd2: refactor wait logic for transaction updates into a common function No functionality change as such in this patch. This only refactors the common piece of code which waits for t_updates to finish into a common function named as jbd2_journal_wait_updates(journal_t *) Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/8c564f70f4b2591171677a2a74fccb22a7b6c3a4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 19 +++------------- fs/jbd2/transaction.c | 53 ++++++++++++++++++++++++++----------------- include/linux/jbd2.h | 4 +++- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d188fa913a07..5b9408e3b370 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -484,22 +484,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); - spin_lock(&commit_transaction->t_handle_lock); - while (atomic_read(&commit_transaction->t_updates)) { - DEFINE_WAIT(wait); + // waits for any t_updates to finish + jbd2_journal_wait_updates(journal); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&commit_transaction->t_updates)) { - spin_unlock(&commit_transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - write_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; write_unlock(&journal->j_state_lock); @@ -817,7 +804,7 @@ start_journal_io: commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); - /* + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a3caedd2285..8e2f8275a253 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -449,7 +449,7 @@ repeat: } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. + * use and add the handle to the running transaction. */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; @@ -836,6 +836,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) } EXPORT_SYMBOL(jbd2_journal_restart); +/* + * Waits for any outstanding t_updates to finish. + * This is called with write j_state_lock held. + */ +void jbd2_journal_wait_updates(journal_t *journal) +{ + transaction_t *commit_transaction = journal->j_running_transaction; + + if (!commit_transaction) + return; + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); +} + /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. @@ -863,27 +892,9 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); } - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; + /* Wait until there are no running t_updates */ + jbd2_journal_wait_updates(journal); - if (!transaction) - break; - - spin_lock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (!atomic_read(&transaction->t_updates)) { - spin_unlock(&transaction->t_handle_lock); - finish_wait(&journal->j_wait_updates, &wait); - break; - } - spin_unlock(&transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - write_lock(&journal->j_state_lock); - } write_unlock(&journal->j_state_lock); /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index afc5572e7b8a..9c3ada74ffb1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,7 +594,7 @@ struct transaction_s */ unsigned long t_log_start; - /* + /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ @@ -1538,6 +1538,8 @@ extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); +void jbd2_journal_wait_updates(journal_t *); + extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); From 7c268d4ce2d3761f666a9950b029c8902bfab710 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 19 Jan 2022 14:02:09 +0100 Subject: [PATCH 1031/1295] ext4: fix potential NULL pointer dereference in ext4_fill_super() By mistake we fail to return an error from ext4_fill_super() in case that ext4_alloc_sbi() fails to allocate a new sbi. Instead we just set the ret variable and allow the function to continue which will later lead to a NULL pointer dereference. Fix it by returning -ENOMEM in the case ext4_alloc_sbi() fails. Fixes: cebe85d570cf ("ext4: switch to the new mount api") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Lukas Czerner Link: https://lore.kernel.org/r/20220119130209.40112-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 57914acc5402..d1c4b04e72ab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5541,7 +5541,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) sbi = ext4_alloc_sbi(sb); if (!sbi) - ret = -ENOMEM; + return -ENOMEM; fc->s_fs_info = sbi; From 715a67f11d6755e0cc853ff4ef539f362b566096 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 10 Jan 2022 21:28:41 +0800 Subject: [PATCH 1032/1295] jbd2: fix kernel-doc descriptions for jbd2_journal_shrink_{scan,count}() Add the description of @shrink and @sc in jbd2_journal_shrink_scan() and jbd2_journal_shrink_count() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/jbd2/journal.c:1296: warning: Function parameter or member 'shrink' not described in 'jbd2_journal_shrink_scan' fs/jbd2/journal.c:1296: warning: Function parameter or member 'sc' not described in 'jbd2_journal_shrink_scan' fs/jbd2/journal.c:1320: warning: Function parameter or member 'shrink' not described in 'jbd2_journal_shrink_count' fs/jbd2/journal.c:1320: warning: Function parameter or member 'sc' not described in 'jbd2_journal_shrink_count' Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220110132841.34531-1-yang.lee@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a8e64ad11ae3..a654d22aa2f1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1287,6 +1287,8 @@ static int jbd2_min_tag_size(void) /** * jbd2_journal_shrink_scan() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Scan the checkpointed buffer on the checkpoint list and release the * journal_head. @@ -1312,6 +1314,8 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, /** * jbd2_journal_shrink_count() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Count the number of checkpoint buffers on the checkpoint list. */ From 8fca8a2b0a822f7936130af7299d2fd7f0a66714 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Wed, 26 Jan 2022 14:31:46 +0800 Subject: [PATCH 1033/1295] ext4: fix incorrect type issue during replay_del_range should not use fast commit log data directly, add le32_to_cpu(). Reported-by: kernel test robot Fixes: 0b5b5a62b945 ("ext4: use ext4_ext_remove_space() for fast commit replay delete range") Cc: stable@kernel.org Signed-off-by: Xin Yin Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220126063146.2302-1-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5934c23e153e..7964ee34e322 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1794,8 +1794,9 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_remove_space(inode, lrange.fc_lblk, - lrange.fc_lblk + lrange.fc_len - 1); + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_lblk) + + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; From f340b3d9027485945d59f9c04f1e33070b02cae2 Mon Sep 17 00:00:00 2001 From: hongnanli Date: Fri, 21 Jan 2022 15:06:11 +0800 Subject: [PATCH 1034/1295] fs/ext4: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Signed-off-by: hongnanli Link: https://lore.kernel.org/r/20220121070611.21618-1-hongnan.li@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 8 ++++---- fs/ext4/ext4.h | 6 +++--- fs/ext4/ext4_jbd2.h | 2 +- fs/ext4/extents.c | 8 ++++---- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 8 ++++---- fs/ext4/migrate.c | 2 +- fs/ext4/orphan.c | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5a35768d6149..57e82e25f8e2 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -139,7 +139,7 @@ fail: /* * Inode operation get_posix_acl(). * - * inode->i_mutex: don't care + * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) @@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu) /* * Set the access or default ACL of an inode. * - * inode->i_mutex: down unless called from ext4_new_inode + * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, @@ -271,8 +271,8 @@ out_stop: /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) + * dir->i_rwsem: down + * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 18cd5b3b4815..09d8f60ebf0f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,7 +1028,7 @@ struct ext4_inode_info { /* * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention + * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. @@ -3407,7 +3407,7 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif -/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && @@ -3418,7 +3418,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0e4fa644df01..db2ae4a2b38d 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking - * i_mutex for direct I/O reads. This only works for extent-based + * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ce4fc250b0d..190c284e0a81 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); @@ -4574,7 +4574,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ @@ -4740,7 +4740,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -5573,7 +5573,7 @@ out_mutex: * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: - * i_mutex is held for both inodes + * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 89efa78ed4b2..07a8c75b65ed 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f368dd5fd8d5..d2a29fc93742 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1223,7 +1223,7 @@ retry_journal: /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes @@ -3972,7 +3972,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* @@ -4122,7 +4122,7 @@ int ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not - * have i_mutex locked because it's not necessary. + * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); @@ -5264,7 +5264,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * transaction are already on disk (truncate waits for pages under * writeback). * - * Called with inode->i_mutex down. + * Called with inode->i_rwsem down. */ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ff8916e1d38e..7a5353a8cfd7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -485,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block + * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 53adc8f570a3..7de0612eb42d 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -93,7 +93,7 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * - * Orphan list manipulation functions must be called under i_mutex unless + * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) @@ -119,7 +119,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either - * hold i_mutex, or the inode can not be referenced from outside, + * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || From 2bdfd2825c9662463371e6691b1a794e97fa36b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Feb 2022 22:31:03 -0500 Subject: [PATCH 1035/1295] cgroup/cpuset: Fix "suspicious RCU usage" lockdep warning It was found that a "suspicious RCU usage" lockdep warning was issued with the rcu_read_lock() call in update_sibling_cpumasks(). It is because the update_cpumasks_hier() function may sleep. So we have to release the RCU lock, call update_cpumasks_hier() and reacquire it afterward. Also add a percpu_rwsem_assert_held() in update_sibling_cpumasks() instead of stating that in the comment. Fixes: 4716909cc5c5 ("cpuset: Track cpusets that use parent's effective_cpus") Signed-off-by: Waiman Long Tested-by: Phil Auld Reviewed-by: Phil Auld Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 804ff5738c5f..4c7254e8f49a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1550,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1561,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } From ac62a0174d62ae0f4447c0c8cf35a8e5d793df56 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Feb 2022 09:02:04 -0600 Subject: [PATCH 1036/1295] dt-bindings: net: qcom,ipa: add optional qcom,qmp property For some systems, the IPA driver must make a request to ensure that its registers are retained across power collapse of the IPA hardware. On such systems, we'll use the existence of the "qcom,qmp" property as a signal that this request is required. Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/qcom,ipa.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml index b86edf67ce62..58ecc62adfaa 100644 --- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml @@ -107,6 +107,10 @@ properties: - const: imem - const: config + qcom,qmp: + $ref: /schemas/types.yaml#/definitions/phandle + description: phandle to the AOSS side-channel message RAM + qcom,smem-states: $ref: /schemas/types.yaml#/definitions/phandle-array description: State bits used in by the AP to signal the modem. @@ -222,6 +226,8 @@ examples: "imem", "config"; + qcom,qmp = <&aoss_qmp>; + qcom,smem-states = <&ipa_smp2p_out 0>, <&ipa_smp2p_out 1>; qcom,smem-state-names = "ipa-clock-enabled-valid", From 34a081761e4e3c35381cbfad609ebae2962fe2f8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Feb 2022 09:02:05 -0600 Subject: [PATCH 1037/1295] net: ipa: request IPA register values be retained In some cases, the IPA hardware needs to request the always-on subsystem (AOSS) to coordinate with the IPA microcontroller to retain IPA register values at power collapse. This is done by issuing a QMP request to the AOSS microcontroller. A similar request ondoes that request. We must get and hold the "QMP" handle early, because we might get back EPROBE_DEFER for that. But the actual request should be sent while we know the IPA clock is active, and when we know the microcontroller is operational. Fixes: 1aac309d3207 ("net: ipa: use autosuspend") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_power.c | 52 +++++++++++++++++++++++++++++++++++++ drivers/net/ipa/ipa_power.h | 7 +++++ drivers/net/ipa/ipa_uc.c | 5 ++++ 3 files changed, 64 insertions(+) diff --git a/drivers/net/ipa/ipa_power.c b/drivers/net/ipa/ipa_power.c index b1c6c0fcb654..f2989aac47a6 100644 --- a/drivers/net/ipa/ipa_power.c +++ b/drivers/net/ipa/ipa_power.c @@ -11,6 +11,8 @@ #include #include +#include "linux/soc/qcom/qcom_aoss.h" + #include "ipa.h" #include "ipa_power.h" #include "ipa_endpoint.h" @@ -64,6 +66,7 @@ enum ipa_power_flag { * struct ipa_power - IPA power management information * @dev: IPA device pointer * @core: IPA core clock + * @qmp: QMP handle for AOSS communication * @spinlock: Protects modem TX queue enable/disable * @flags: Boolean state flags * @interconnect_count: Number of elements in interconnect[] @@ -72,6 +75,7 @@ enum ipa_power_flag { struct ipa_power { struct device *dev; struct clk *core; + struct qmp *qmp; spinlock_t spinlock; /* used with STOPPED/STARTED power flags */ DECLARE_BITMAP(flags, IPA_POWER_FLAG_COUNT); u32 interconnect_count; @@ -382,6 +386,47 @@ void ipa_power_modem_queue_active(struct ipa *ipa) clear_bit(IPA_POWER_FLAG_STARTED, ipa->power->flags); } +static int ipa_power_retention_init(struct ipa_power *power) +{ + struct qmp *qmp = qmp_get(power->dev); + + if (IS_ERR(qmp)) { + if (PTR_ERR(qmp) == -EPROBE_DEFER) + return -EPROBE_DEFER; + + /* We assume any other error means it's not defined/needed */ + qmp = NULL; + } + power->qmp = qmp; + + return 0; +} + +static void ipa_power_retention_exit(struct ipa_power *power) +{ + qmp_put(power->qmp); + power->qmp = NULL; +} + +/* Control register retention on power collapse */ +void ipa_power_retention(struct ipa *ipa, bool enable) +{ + static const char fmt[] = "{ class: bcm, res: ipa_pc, val: %c }"; + struct ipa_power *power = ipa->power; + char buf[36]; /* Exactly enough for fmt[]; size a multiple of 4 */ + int ret; + + if (!power->qmp) + return; /* Not needed on this platform */ + + (void)snprintf(buf, sizeof(buf), fmt, enable ? '1' : '0'); + + ret = qmp_send(power->qmp, buf, sizeof(buf)); + if (ret) + dev_err(power->dev, "error %d sending QMP %sable request\n", + ret, enable ? "en" : "dis"); +} + int ipa_power_setup(struct ipa *ipa) { int ret; @@ -438,12 +483,18 @@ ipa_power_init(struct device *dev, const struct ipa_power_data *data) if (ret) goto err_kfree; + ret = ipa_power_retention_init(power); + if (ret) + goto err_interconnect_exit; + pm_runtime_set_autosuspend_delay(dev, IPA_AUTOSUSPEND_DELAY); pm_runtime_use_autosuspend(dev); pm_runtime_enable(dev); return power; +err_interconnect_exit: + ipa_interconnect_exit(power); err_kfree: kfree(power); err_clk_put: @@ -460,6 +511,7 @@ void ipa_power_exit(struct ipa_power *power) pm_runtime_disable(dev); pm_runtime_dont_use_autosuspend(dev); + ipa_power_retention_exit(power); ipa_interconnect_exit(power); kfree(power); clk_put(clk); diff --git a/drivers/net/ipa/ipa_power.h b/drivers/net/ipa/ipa_power.h index 2151805d7fbb..6f84f057a209 100644 --- a/drivers/net/ipa/ipa_power.h +++ b/drivers/net/ipa/ipa_power.h @@ -40,6 +40,13 @@ void ipa_power_modem_queue_wake(struct ipa *ipa); */ void ipa_power_modem_queue_active(struct ipa *ipa); +/** + * ipa_power_retention() - Control register retention on power collapse + * @ipa: IPA pointer + * @enable: Whether retention should be enabled or disabled + */ +void ipa_power_retention(struct ipa *ipa, bool enable); + /** * ipa_power_setup() - Set up IPA power management * @ipa: IPA pointer diff --git a/drivers/net/ipa/ipa_uc.c b/drivers/net/ipa/ipa_uc.c index 856e55a080a7..fe11910518d9 100644 --- a/drivers/net/ipa/ipa_uc.c +++ b/drivers/net/ipa/ipa_uc.c @@ -11,6 +11,7 @@ #include "ipa.h" #include "ipa_uc.h" +#include "ipa_power.h" /** * DOC: The IPA embedded microcontroller @@ -154,6 +155,7 @@ static void ipa_uc_response_hdlr(struct ipa *ipa, enum ipa_irq_id irq_id) case IPA_UC_RESPONSE_INIT_COMPLETED: if (ipa->uc_powered) { ipa->uc_loaded = true; + ipa_power_retention(ipa, true); pm_runtime_mark_last_busy(dev); (void)pm_runtime_put_autosuspend(dev); ipa->uc_powered = false; @@ -184,6 +186,9 @@ void ipa_uc_deconfig(struct ipa *ipa) ipa_interrupt_remove(ipa->interrupt, IPA_IRQ_UC_1); ipa_interrupt_remove(ipa->interrupt, IPA_IRQ_UC_0); + if (ipa->uc_loaded) + ipa_power_retention(ipa, false); + if (!ipa->uc_powered) return; From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: [PATCH 1038/1295] Revert "module, async: async_synchronize_full() on module init iff async is used" This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78c..75ba8aa60248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1680,7 +1680,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 24dab046e16c..46a5c2ed1928 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. - * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + From 1f2cfdd349b7647f438c1e552dc1b983da86d830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 3 Feb 2022 15:50:29 +0100 Subject: [PATCH 1039/1295] printk: Fix incorrect __user type in proc_dointvec_minmax_sysadmin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The move of proc_dointvec_minmax_sysadmin() from kernel/sysctl.c to kernel/printk/sysctl.c introduced an incorrect __user attribute to the buffer argument. I spotted this change in [1] as well as the kernel test robot. Revert this change to please sparse: kernel/printk/sysctl.c:20:51: warning: incorrect type in argument 3 (different address spaces) kernel/printk/sysctl.c:20:51: expected void * kernel/printk/sysctl.c:20:51: got void [noderef] __user *buffer Fixes: faaa357a55e0 ("printk: move printk sysctl to printk/sysctl.c") Link: https://lore.kernel.org/r/20220104155024.48023-2-mic@digikod.net [1] Reported-by: kernel test robot Cc: Andrew Morton Cc: John Ogness Cc: Luis Chamberlain Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Xiaoming Ni Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220203145029.272640-1-mic@digikod.net Signed-off-by: Linus Torvalds --- kernel/printk/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index 653ae04aab7f..c228343eeb97 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -12,7 +12,7 @@ static const int ten_thousand = 10000; static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; From 80d4609008e6d696a279e39ae7458c916fcd44c1 Mon Sep 17 00:00:00 2001 From: Yannick Vignon Date: Thu, 3 Feb 2022 17:00:25 +0100 Subject: [PATCH 1040/1295] net: stmmac: ensure PTP time register reads are consistent Even if protected from preemption and interrupts, a small time window remains when the 2 register reads could return inconsistent values, each time the "seconds" register changes. This could lead to an about 1-second error in the reported time. Add logic to ensure the "seconds" and "nanoseconds" values are consistent. Fixes: 92ba6888510c ("stmmac: add the support for PTP hw clock driver") Signed-off-by: Yannick Vignon Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220203160025.750632-1-yannick.vignon@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 074e2cdfb0fa..a7ec9f4d46ce 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -145,15 +145,20 @@ static int adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec, static void get_systime(void __iomem *ioaddr, u64 *systime) { - u64 ns; + u64 ns, sec0, sec1; - /* Get the TSSS value */ - ns = readl(ioaddr + PTP_STNSR); - /* Get the TSS and convert sec time value to nanosecond */ - ns += readl(ioaddr + PTP_STSR) * 1000000000ULL; + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + do { + sec0 = sec1; + /* Get the TSSS value */ + ns = readl_relaxed(ioaddr + PTP_STNSR); + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + } while (sec0 != sec1); if (systime) - *systime = ns; + *systime = ns + (sec1 * 1000000000ULL); } static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) From 87563a043cef044fed5db7967a75741cc16ad2b1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 3 Feb 2022 23:08:11 +0800 Subject: [PATCH 1041/1295] ax25: fix reference count leaks of ax25_dev The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 50b417df6221..8221af1811df 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -294,10 +294,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 32f61978ff29..3e49d28824ed 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -359,21 +359,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 770b787fb7bb..d2a244e1c260 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -115,8 +115,8 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -126,8 +126,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -150,25 +150,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } - ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 1e32693833e5..9751207f7757 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -108,6 +112,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -116,11 +121,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -133,6 +138,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -173,8 +179,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -216,8 +222,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } From dcb85f85fa6f142aae1fe86f399d4503d49f2b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 3 Feb 2022 12:17:54 -0800 Subject: [PATCH 1042/1295] gcc-plugins/stackleak: Use noinstr in favor of notrace While the stackleak plugin was already using notrace, objtool is now a bit more picky. Update the notrace uses to noinstr. Silences the following objtool warnings when building with: CONFIG_DEBUG_ENTRY=y CONFIG_STACK_VALIDATION=y CONFIG_VMLINUX_VALIDATION=y CONFIG_GCC_PLUGIN_STACKLEAK=y vmlinux.o: warning: objtool: do_syscall_64()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_general_protection()+0x22: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: fixup_bad_iret()+0x20: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_machine_check()+0x27: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: .text+0x5346e: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x143: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x10eb: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x17f9: call to stackleak_erase() leaves .noinstr.text section Note that the plugin's addition of calls to stackleak_track_stack() from noinstr functions is expected to be safe, as it isn't runtime instrumentation and is self-contained. Cc: Alexander Popov Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- kernel/stackleak.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 66b8af394e58..ddb5a7f48d69 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void notrace stackleak_erase(void) +asmlinkage void noinstr stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -124,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } -NOKPROBE_SYMBOL(stackleak_erase); -void __used __no_caller_saved_registers notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) { unsigned long sp = current_stack_pointer; From ed14fc7a79ab43e9f2cb1fa9c1733fdc133bba30 Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Thu, 3 Feb 2022 11:29:00 +0100 Subject: [PATCH 1043/1295] net: sparx5: Fix get_stat64 crash in tcpdump This problem was found with Sparx5 when the tcpdump tool requests the do_get_stats64 (sparx5_get_stats64) statistic. The portstats pointer was incorrectly incremented when fetching priority based statistics. Fixes: af4b11022e2d (net: sparx5: add ethtool configuration and statistics support) Signed-off-by: Steen Hegelund Link: https://lore.kernel.org/r/20220203102900.528987-1-steen.hegelund@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.c b/drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.c index 59783fc46a7b..10b866e9f726 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.c @@ -1103,7 +1103,7 @@ void sparx5_get_stats64(struct net_device *ndev, stats->tx_carrier_errors = portstats[spx5_stats_tx_csense_cnt]; stats->tx_window_errors = portstats[spx5_stats_tx_late_coll_cnt]; stats->rx_dropped = portstats[spx5_stats_ana_ac_port_stat_lsb_cnt]; - for (idx = 0; idx < 2 * SPX5_PRIOS; ++idx, ++stats) + for (idx = 0; idx < 2 * SPX5_PRIOS; ++idx) stats->rx_dropped += portstats[spx5_stats_green_p0_rx_port_drop + idx]; stats->tx_dropped = portstats[spx5_stats_tx_local_drop]; From b13e0c71856817fca67159b11abac350e41289f5 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Feb 2022 22:42:09 -0500 Subject: [PATCH 1044/1295] block: bio-integrity: Advance seed correctly for larger interval sizes Commit 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") added code to update the integrity seed value when advancing a bio. However, it failed to take into account that the integrity interval might be larger than the 512-byte block layer sector size. This broke bio splitting on PI devices with 4KB logical blocks. The seed value should be advanced by bio_integrity_intervals() and not the number of sectors. Cc: Dmitry Monakhov Cc: stable@vger.kernel.org Fixes: 309a62fa3a9e ("bio-integrity: bio_integrity_advance must update integrity seed") Tested-by: Dmitry Ivanov Reported-by: Alexey Lyashkov Signed-off-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220204034209.4193-1-martin.petersen@oracle.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index d25114715459..0827b19820c5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -373,7 +373,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } From 24331050a3e6afcd4451409831dd9ae8085a42f6 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 4 Feb 2022 03:02:03 +0800 Subject: [PATCH 1045/1295] erofs: fix small compressed files inlining Prior to ztailpacking feature, it's enough that each lcluster has two pclusters at most, and the last pcluster should be turned into an uncompressed pcluster when necessary. For example, _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 _|____ EOFed ____| which should be converted into: _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 (uncompressed)' _| That is fine since either pcluster n-1 or (uncompressed)' takes one physical block. However, after ztailpacking was supported, the game is changed since the last pcluster can be inlined now. And such case above is quite common for inlining small files. Therefore, in order to inline more effectively, special EOF lclusters are now supported which can have three parts at most, as illustrated below: _________________________________________________ |_ pcluster n-2 _|_ pcluster n-1 _|____ EOFed ____| ^ i_size Actually similar code exists in Yue Hu's original patchset [1], but I removed this part on purpose. After evaluating more real cases with small files, I've changed my mind. [1] https://lore.kernel.org/r/20211215094449.15162-1-huyue2@yulong.com Link: https://lore.kernel.org/r/20220203190203.30794-1-xiang@kernel.org Fixes: ab92184ff8f1 ("erofs: add on-disk compressed tail-packing inline support") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 18d7fd1a5064..361b1d6e4bf9 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -630,6 +630,13 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + /* + * For ztailpacking files, in order to inline data more + * effectively, special EOF lclusters are now supported + * which can have three parts at most. + */ + if (ztailpacking && end > inode->i_size) + end = inode->i_size; break; } /* m.lcn should be >= 1 if endoff < m.clusterofs */ From 77b337196a9d87f3d6bb9b07c0436ecafbffda1e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Jan 2022 13:13:32 +0100 Subject: [PATCH 1046/1295] netfilter: conntrack: don't refresh sctp entries in closed state Vivek Thrivikraman reported: An SCTP server application which is accessed continuously by client application. When the session disconnects the client retries to establish a connection. After restart of SCTP server application the session is not established because of stale conntrack entry with connection state CLOSED as below. (removing this entry manually established new connection): sctp 9 CLOSED src=10.141.189.233 [..] [ASSURED] Just skip timeout update of closed entries, we don't want them to stay around forever. Reported-and-tested-by: Vivek Thrivikraman Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1579 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 2394238d01c9..5a936334b517 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -489,6 +489,15 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; + + /* don't renew timeout on init retransmit so + * port reuse by client or NAT middlebox cannot + * keep entry alive indefinitely (incl. nat info). + */ + if (new_state == SCTP_CONNTRACK_CLOSED && + old_state == SCTP_CONNTRACK_CLOSED && + nf_ct_is_confirmed(ct)) + ignore = true; } ct->proto.sctp.state = new_state; From a9e8503def0fd4ed89ade1f61c315f904581d439 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jan 2022 17:13:23 +0100 Subject: [PATCH 1047/1295] netfilter: nft_payload: don't allow th access for fragments Loads relative to ->thoff naturally expect that this points to the transport header, but this is only true if pkt->fragoff == 0. This has little effect for rulesets with connection tracking/nat because these enable ip defra. For other rulesets this prevents false matches. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 2 +- net/netfilter/nft_payload.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index dbe1f2e7dd9e..9e927ab4df15 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -167,7 +167,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, { struct tcphdr *tcph; - if (pkt->tprot != IPPROTO_TCP) + if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) return NULL; tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 940fed9a760b..5cc06aef4345 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -83,7 +83,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) { unsigned int thoff = nft_thoff(pkt); - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) return -1; switch (pkt->tprot) { @@ -147,7 +147,7 @@ void nft_payload_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) goto err; offset = nft_thoff(pkt); break; @@ -688,7 +688,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) goto err; offset = nft_thoff(pkt); break; @@ -728,7 +728,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr, if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { - if (nft_payload_csum_sctp(skb, nft_thoff(pkt))) + if (pkt->fragoff == 0 && + nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } From cc4f9d62037ebcb811f4908bba2986c01df1bd50 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jan 2022 17:47:00 +0100 Subject: [PATCH 1048/1295] netfilter: conntrack: move synack init code to helper It seems more readable to use a common helper in the followup fix rather than copypaste or goto. No functional change intended. The function is only called for syn-ack or syn in repy direction in case of simultaneous open. Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 47 ++++++++++++++++---------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index af5115e127cf..88c89e97d8a2 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -446,6 +446,32 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } +static void tcp_init_sender(struct ip_ct_tcp_state *sender, + struct ip_ct_tcp_state *receiver, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u32 end, u32 win) +{ + /* SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { + sender->td_scale = 0; + receiver->td_scale = 0; + } +} + static bool tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, @@ -499,24 +525,9 @@ static bool tcp_in_window(struct nf_conn *ct, * Initialize sender data. */ if (tcph->syn) { - /* - * SYN-ACK in reply to a SYN - * or SYN from reply direction in simultaneous open. - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, dataoff, tcph, sender); - /* - * RFC 1323: - * Both sides must send the Window Scale option - * to enable window scaling in either direction. - */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE - && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) - sender->td_scale = - receiver->td_scale = 0; + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win); if (!tcph->ack) /* Simultaneous open */ return true; From 82b72cb94666b3dbd7152bb9f441b068af7a921b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jan 2022 17:47:01 +0100 Subject: [PATCH 1049/1295] netfilter: conntrack: re-init state for retransmitted syn-ack TCP conntrack assumes that a syn-ack retransmit is identical to the previous syn-ack. This isn't correct and causes stuck 3whs in some more esoteric scenarios. tcpdump to illustrate the problem: client > server: Flags [S] seq 1365731894, win 29200, [mss 1460,sackOK,TS val 2083035583 ecr 0,wscale 7] server > client: Flags [S.] seq 145824453, ack 643160523, win 65535, [mss 8952,wscale 5,TS val 3215367629 ecr 2082921663] Note the invalid/outdated synack ack number. Conntrack marks this syn-ack as out-of-window/invalid, but it did initialize the reply direction parameters based on this packets content. client > server: Flags [S] seq 1365731894, win 29200, [mss 1460,sackOK,TS val 2083036623 ecr 0,wscale 7] ... retransmit... server > client: Flags [S.], seq 145824453, ack 643160523, win 65535, [mss 8952,wscale 5,TS val 3215368644 ecr 2082921663] and another bogus synack. This repeats, then client re-uses for a new attempt: client > server: Flags [S], seq 2375731741, win 29200, [mss 1460,sackOK,TS val 2083100223 ecr 0,wscale 7] server > client: Flags [S.], seq 145824453, ack 643160523, win 65535, [mss 8952,wscale 5,TS val 3215430754 ecr 2082921663] ... but still gets a invalid syn-ack. This repeats until: server > client: Flags [S.], seq 145824453, ack 643160523, win 65535, [mss 8952,wscale 5,TS val 3215437785 ecr 2082921663] server > client: Flags [R.], seq 145824454, ack 643160523, win 65535, [mss 8952,wscale 5,TS val 3215443451 ecr 2082921663] client > server: Flags [S], seq 2375731741, win 29200, [mss 1460,sackOK,TS val 2083115583 ecr 0,wscale 7] server > client: Flags [S.], seq 162602410, ack 2375731742, win 65535, [mss 8952,wscale 5,TS val 3215445754 ecr 2083115583] This syn-ack has the correct ack number, but conntrack flags it as invalid: The internal state was created from the first syn-ack seen so the sequence number of the syn-ack is treated as being outside of the announced window. Don't assume that retransmitted syn-ack is identical to previous one. Treat it like the first syn-ack and reinit state. Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 88c89e97d8a2..d1582b888c0d 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -571,6 +571,18 @@ static bool tcp_in_window(struct nf_conn *ct, sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); + } else if (tcph->syn && dir == IP_CT_DIR_REPLY && + state->state == TCP_CONNTRACK_SYN_SENT) { + /* Retransmitted syn-ack, or syn (simultaneous open). + * + * Re-init state for this direction, just like for the first + * syn(-ack) reply, it might differ in seq, ack or tcp options. + */ + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win); + if (!tcph->ack) + return true; } if (!(tcph->ack)) { From 1f6339e034d5780ad7097c8d8c11b26e0762afba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jan 2022 18:30:18 +0100 Subject: [PATCH 1050/1295] MAINTAINERS: netfilter: update git links nf and nf-next have a new location. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index bb83dcbcb619..3a9cb567d47c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13297,8 +13297,8 @@ W: http://www.iptables.org/ W: http://www.nftables.org/ Q: http://patchwork.ozlabs.org/project/netfilter-devel/list/ C: irc://irc.libera.chat/netfilter -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git F: include/linux/netfilter* F: include/linux/netfilter/ F: include/net/netfilter/ From d1ca60efc53d665cf89ed847a14a510a81770b81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 2 Feb 2022 12:00:56 +0100 Subject: [PATCH 1051/1295] netfilter: ctnetlink: disable helper autoassign When userspace, e.g. conntrackd, inserts an entry with a specified helper, its possible that the helper is lost immediately after its added: ctnetlink_create_conntrack -> nf_ct_helper_ext_add + assign helper -> ctnetlink_setup_nat -> ctnetlink_parse_nat_setup -> parse_nat_setup -> nfnetlink_parse_nat_setup -> nf_nat_setup_info -> nf_conntrack_alter_reply -> __nf_ct_try_assign_helper ... and __nf_ct_try_assign_helper will zero the helper again. Set IPS_HELPER bit to bypass auto-assign logic, its unwanted, just like when helper is assigned via ruleset. Dropped old 'not strictly necessary' comment, it referred to use of rcu_assign_pointer() before it got replaced by RCU_INIT_POINTER(). NB: Fixes tag intentionally incorrect, this extends the referenced commit, but this change won't build without IPS_HELPER introduced there. Fixes: 6714cf5465d280 ("netfilter: nf_conntrack: fix explicit helper attachment and NAT") Reported-by: Pham Thanh Tuyen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +- net/netfilter/nf_conntrack_netlink.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 4b3395082d15..26071021e986 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -106,7 +106,7 @@ enum ip_conntrack_status { IPS_NAT_CLASH = IPS_UNTRACKED, #endif - /* Conntrack got a helper explicitly attached via CT target. */ + /* Conntrack got a helper explicitly attached (ruleset, ctnetlink). */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ac438370f94a..7032402ffd33 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2311,7 +2311,8 @@ ctnetlink_create_conntrack(struct net *net, if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - /* not in hash table yet so not strictly necessary */ + /* disable helper auto-assignment for this entry */ + ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } } else { From 6d896d3b44cf64ab9b2483697e222098e7b72f70 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 20 Jan 2022 21:10:11 +0900 Subject: [PATCH 1052/1295] ksmbd: smbd: validate buffer descriptor structures Check ChannelInfoOffset and ChannelInfoLength to validate buffer descriptor structures. And add a debug log to print the structures' content. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 3926ca18dca4..6806994383d9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6126,13 +6126,26 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { + unsigned int i, ch_count; + if (work->conn->dialect == SMB30_PROT_ID && Channel != SMB2_CHANNEL_RDMA_V1) return -EINVAL; - if (ChannelInfoOffset == 0 || - le16_to_cpu(ChannelInfoLength) < sizeof(*desc)) + ch_count = le16_to_cpu(ChannelInfoLength) / sizeof(*desc); + if (ksmbd_debug_types & KSMBD_DEBUG_RDMA) { + for (i = 0; i < ch_count; i++) { + pr_info("RDMA r/w request %#x: token %#x, length %#x\n", + i, + le32_to_cpu(desc[i].token), + le32_to_cpu(desc[i].length)); + } + } + if (ch_count != 1) { + ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", + ch_count); return -EINVAL; + } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); @@ -6185,9 +6198,15 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); + + if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { + err = -EINVAL; + goto out; + } err = smb2_set_remote_key_for_rdma(work, (struct smb2_buffer_desc_v1 *) - &req->Buffer[0], + ((char *)req + ch_offset), req->Channel, req->ReadChannelInfoOffset, req->ReadChannelInfoLength); @@ -6428,11 +6447,16 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; + unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); + + if (req->Length != 0 || req->DataOffset != 0 || + ch_offset < offsetof(struct smb2_write_req, Buffer)) { + err = -EINVAL; + goto out; + } err = smb2_set_remote_key_for_rdma(work, (struct smb2_buffer_desc_v1 *) - &req->Buffer[0], + ((char *)req + ch_offset), req->Channel, req->WriteChannelInfoOffset, req->WriteChannelInfoLength); From 97550c7478a2da93e348d8c3075d92cddd473a78 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Jan 2022 18:28:56 +0900 Subject: [PATCH 1053/1295] ksmbd: fix same UniqueId for dot and dotdot entries ksmbd sets the inode number to UniqueId. However, the same UniqueId for dot and dotdot entry is set to the inode number of the parent inode. This patch set them using the current inode and parent inode. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index ef7f42b0290a..9a7e211dbf4f 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, for (i = 0; i < 2; i++) { struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; + struct dentry *dentry; if (!dir->dot_dotdot[i]) { /* fill dot entry info */ if (i == 0) { d_info->name = "."; d_info->name_len = 1; + dentry = dir->filp->f_path.dentry; } else { d_info->name = ".."; d_info->name_len = 2; + dentry = dir->filp->f_path.dentry->d_parent; } if (!match_pattern(d_info->name, d_info->name_len, @@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, user_ns, - dir->filp->f_path.dentry->d_parent, + dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); if (rc) From 04e260948a160d3b7d622bf4c8a96fa4577c09bd Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Jan 2022 18:31:01 +0900 Subject: [PATCH 1054/1295] ksmbd: don't align last entry offset in smb2 query directory When checking smb2 query directory packets from other servers, OutputBufferLength is different with ksmbd. Other servers add an unaligned next offset to OutputBufferLength for the last entry. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 7 ++++--- fs/ksmbd/vfs.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 6806994383d9..67e8e28e3fc3 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3422,9 +3422,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level); - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, - KSMBD_DIR_INFO_ALIGNMENT); + struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); + d_info->last_entry_off_align = next_entry_offset - struct_sz; if (next_entry_offset > d_info->out_buf_len) { d_info->out_buf_len = 0; @@ -3976,6 +3976,7 @@ int smb2_query_dir(struct ksmbd_work *work) ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index adf94a4f22fa..8c37aaf936ab 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -47,6 +47,7 @@ struct ksmbd_dir_info { int last_entry_offset; bool hide_dot_file; int flags; + int last_entry_off_align; }; struct ksmbd_readdir_data { From deae24b0b13ff5f46022124fbfc2c72fc534bc6a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Jan 2022 18:28:11 +0900 Subject: [PATCH 1055/1295] ksmbd: reduce smb direct max read/write size ksmbd does not support more than one Buffer Descriptor V1 element in an smbdirect protocol request. Reducing the maximum read/write size to about 512KB allows interoperability with Windows over a wider variety of RDMA NICs, as an interim workaround. Reviewed-by: Tom Talpey Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 3c1ec1ac0b27..ba5a22bc2e6d 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1048512; +static int smb_direct_max_read_write_size = 524224; static int smb_direct_max_outstanding_rw_ops = 8; From f9929ef6a2a55f03aac61248c6a3a987b8546f2a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 1 Feb 2022 18:20:34 +0900 Subject: [PATCH 1056/1295] ksmbd: add support for key exchange When mounting cifs client, can see the following warning message. CIFS: decode_ntlmssp_challenge: authentication has been weakened as server does not support key exchange To remove this warning message, Add support for key exchange feature to ksmbd. This patch decrypts 16-byte ciphertext value sent by the client using RC4 with session key. The decrypted value is the recovered secondary key that will use instead of the session key for signing and sealing. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/Kconfig | 4 ++-- fs/ksmbd/auth.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 7a2b11c0b803..6c7dc1387beb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -369,8 +369,8 @@ source "fs/ksmbd/Kconfig" config SMBFS_COMMON tristate - default y if CIFS=y - default m if CIFS=m + default y if CIFS=y || SMB_SERVER=y + default m if CIFS=m || SMB_SERVER=m source "fs/coda/Kconfig" source "fs/afs/Kconfig" diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index dc3d061edda9..911444d21267 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -29,6 +29,7 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" +#include "../smbfs_common/arc4.h" /* * Fixed format data defining GSS header and fixed string @@ -336,6 +337,29 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, nt_len - CIFS_ENCPWD_SIZE, domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); + + /* The recovered secondary session key */ + if (conn->ntlmssp.client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) { + struct arc4_ctx *ctx_arc4; + unsigned int sess_key_off, sess_key_len; + + sess_key_off = le32_to_cpu(authblob->SessionKey.BufferOffset); + sess_key_len = le16_to_cpu(authblob->SessionKey.Length); + + if (blob_len < (u64)sess_key_off + sess_key_len) + return -EINVAL; + + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); + if (!ctx_arc4) + return -ENOMEM; + + cifs_arc4_setkey(ctx_arc4, sess->sess_key, + SMB2_NTLMV2_SESSKEY_SIZE); + cifs_arc4_crypt(ctx_arc4, sess->sess_key, + (char *)authblob + sess_key_off, sess_key_len); + kfree_sensitive(ctx_arc4); + } + return ret; } @@ -408,6 +432,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (cflags & NTLMSSP_NEGOTIATE_KEY_XCH) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + chgblob->NegotiateFlags = cpu_to_le32(flags); len = strlen(ksmbd_netbios_name()); name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL); From d052c5d3a35fcea2d9089d76e295d7af713e8865 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 3 Feb 2022 22:47:09 +0300 Subject: [PATCH 1057/1295] MAINTAINERS: add myself as Renesas R-Car SATA driver reviewer Add myself as a reviewer for the Renesas R-Car SATA driver -- I don't have the hardware anymore (Geert Uytterhoeven does have a lot of hardware!) but I do have the manuals still! :-) Signed-off-by: Sergey Shtylyov Acked-by: Geert Uytterhoeven Signed-off-by: Damien Le Moal --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3dbc66c37335..82d1b0d2fab1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16475,6 +16475,14 @@ F: Documentation/devicetree/bindings/i2c/renesas,rmobile-iic.yaml F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c +RENESAS R-CAR SATA DRIVER +R: Sergey Shtylyov +S: Supported +L: linux-ide@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +F: Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml +F: drivers/ata/sata_rcar.c + RENESAS R-CAR THERMAL DRIVERS M: Niklas Söderlund L: linux-renesas-soc@vger.kernel.org From ac9f0c810684a1b161c18eb4b91ce84cbc13c91d Mon Sep 17 00:00:00 2001 From: Anton Lundin Date: Thu, 3 Feb 2022 10:41:35 +0100 Subject: [PATCH 1058/1295] ata: libata-core: Introduce ATA_HORKAGE_NO_LOG_DIR horkage 06f6c4c6c3e8 ("ata: libata: add missing ata_identify_page_supported() calls") introduced additional calls to ata_identify_page_supported(), thus also adding indirectly accesses to the device log directory log page through ata_log_supported(). Reading this log page causes SATADOM-ML 3ME devices to lock up. Introduce the horkage flag ATA_HORKAGE_NO_LOG_DIR to prevent accesses to the log directory in ata_log_supported() and add a blacklist entry with this flag for "SATADOM-ML 3ME" devices. Fixes: 636f6e2af4fb ("libata: add horkage for missing Identify Device log") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Anton Lundin Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 10 ++++++++++ include/linux/libata.h | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 67f88027680a..e1b1dd215267 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2007,6 +2007,9 @@ static bool ata_log_supported(struct ata_device *dev, u8 log) { struct ata_port *ap = dev->link->ap; + if (dev->horkage & ATA_HORKAGE_NO_LOG_DIR) + return false; + if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, ap->sector_buf, 1)) return false; return get_unaligned_le16(&ap->sector_buf[log * 2]) ? true : false; @@ -4073,6 +4076,13 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + /* + * This sata dom device goes on a walkabout when the ATA_LOG_DIRECTORY + * log page is accessed. Ensure we never ask for this log page with + * these devices. + */ + { "SATADOM-ML 3ME", NULL, ATA_HORKAGE_NO_LOG_DIR }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 605756f645be..7f99b4d78822 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -380,6 +380,7 @@ enum { ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ + ATA_HORKAGE_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ From e3bcfda012edd3564e12551b212afbd2521a1f68 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 3 Feb 2022 16:13:48 -0800 Subject: [PATCH 1059/1295] KVM: x86: Report deprecated x87 features in supported CPUID CPUID.(EAX=7,ECX=0):EBX.FDP_EXCPTN_ONLY[bit 6] and CPUID.(EAX=7,ECX=0):EBX.ZERO_FCS_FDS[bit 13] are "defeature" bits. Unlike most of the other CPUID feature bits, these bits are clear if the features are present and set if the features are not present. These bits should be reported in KVM_GET_SUPPORTED_CPUID, because if these bits are set on hardware, they cannot be cleared in the guest CPUID. Doing so would claim guest support for a feature that the hardware doesn't support and that can't be efficiently emulated. Of course, any software (e.g WIN87EM.DLL) expecting these features to be present likely predates these CPUID feature bits and therefore doesn't know to check for them anyway. Aaron Lewis added the corresponding X86_FEATURE macros in commit cbb99c0f5887 ("x86/cpufeatures: Add FDP_EXCPTN_ONLY and ZERO_FCS_FDS"), with the intention of reporting these bits in KVM_GET_SUPPORTED_CPUID, but I was unable to find a proposed patch on the kvm list. Opportunistically reordered the CPUID_7_0_EBX capability bits from least to most significant. Cc: Aaron Lewis Signed-off-by: Jim Mattson Message-Id: <20220204001348.2844660-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 28be02adc669..494d4d351859 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -554,12 +554,13 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ - ); + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | + F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | + F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | + F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | + F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | + F(AVX512VL)); kvm_cpu_cap_mask(CPUID_7_ECX, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | From bc41099f060ea74ac8d02c51bd0f5f46d969bedf Mon Sep 17 00:00:00 2001 From: Dongjin Kim Date: Thu, 27 Jan 2022 21:29:25 +0900 Subject: [PATCH 1060/1295] arm64: dts: meson-g12b-odroid-n2: fix typo 'dio2133' Typo in audio amplifier node, dioo2133 -> dio2133 Signed-off-by: Dongjin Kim Fixes: ef599f5f3e10 ("arm64: dts: meson: convert ODROID-N2 to dtsi") Fixes: 67d141c1f8e6 ("arm64: dts: meson: odroid-n2: add jack audio output support") Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/YfKQJejh0bfGYvof@anyang --- arch/arm64/boot/dts/amlogic/meson-g12b-odroid-n2.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-odroid-n2.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b-odroid-n2.dtsi index 3e968b244191..fd3fa82e4c33 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12b-odroid-n2.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12b-odroid-n2.dtsi @@ -17,7 +17,7 @@ rtc1 = &vrtc; }; - dioo2133: audio-amplifier-0 { + dio2133: audio-amplifier-0 { compatible = "simple-audio-amplifier"; enable-gpios = <&gpio_ao GPIOAO_2 GPIO_ACTIVE_HIGH>; VCC-supply = <&vcc_5v>; @@ -219,7 +219,7 @@ audio-widgets = "Line", "Lineout"; audio-aux-devs = <&tdmout_b>, <&tdmout_c>, <&tdmin_a>, <&tdmin_b>, <&tdmin_c>, <&tdmin_lb>, - <&dioo2133>; + <&dio2133>; audio-routing = "TDMOUT_B IN 0", "FRDDR_A OUT 1", "TDMOUT_B IN 1", "FRDDR_B OUT 1", "TDMOUT_B IN 2", "FRDDR_C OUT 1", From 323ca765bfe9d637fa774373baec0bc41e51fcfa Mon Sep 17 00:00:00 2001 From: Lutz Koschorreck Date: Thu, 27 Jan 2022 14:05:37 +0100 Subject: [PATCH 1061/1295] arm64: dts: meson-sm1-odroid: use correct enable-gpio pin for tf-io regulator The interrupt pin of the external ethernet phy is used, instead of the enable-gpio pin of the tf-io regulator. The GPIOE_2 pin is located in the gpio_ao bank. This causes phy interrupt problems at system startup. [ 76.645190] irq 36: nobody cared (try booting with the "irqpoll" option) [ 76.649617] CPU: 0 PID: 1416 Comm: irq/36-0.0:00 Not tainted 5.16.0 #2 [ 76.649629] Hardware name: Hardkernel ODROID-HC4 (DT) [ 76.649635] Call trace: [ 76.649638] dump_backtrace+0x0/0x1c8 [ 76.649658] show_stack+0x14/0x60 [ 76.649667] dump_stack_lvl+0x64/0x7c [ 76.649676] dump_stack+0x14/0x2c [ 76.649683] __report_bad_irq+0x38/0xe8 [ 76.649695] note_interrupt+0x220/0x3a0 [ 76.649704] handle_irq_event_percpu+0x58/0x88 [ 76.649713] handle_irq_event+0x44/0xd8 [ 76.649721] handle_fasteoi_irq+0xa8/0x130 [ 76.649730] generic_handle_domain_irq+0x38/0x58 [ 76.649738] gic_handle_irq+0x9c/0xb8 [ 76.649747] call_on_irq_stack+0x28/0x38 [ 76.649755] do_interrupt_handler+0x7c/0x80 [ 76.649763] el1_interrupt+0x34/0x80 [ 76.649772] el1h_64_irq_handler+0x14/0x20 [ 76.649781] el1h_64_irq+0x74/0x78 [ 76.649788] irq_finalize_oneshot.part.56+0x68/0xf8 [ 76.649796] irq_thread_fn+0x5c/0x98 [ 76.649804] irq_thread+0x13c/0x260 [ 76.649812] kthread+0x144/0x178 [ 76.649822] ret_from_fork+0x10/0x20 [ 76.649830] handlers: [ 76.653170] [<0000000025a6cd31>] irq_default_primary_handler threaded [<0000000093580eb7>] phy_interrupt [ 76.661256] Disabling IRQ #36 Fixes: 1f80a5cf74a6 ("arm64: dts: meson-sm1-odroid: add missing enable gpio and supply for tf_io regulator") Signed-off-by: Lutz Koschorreck Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong [narmstrong: removed spurious invalid & blank lines from commit message] Link: https://lore.kernel.org/r/20220127130537.GA187347@odroid-VirtualBox --- arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi b/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi index 0bd1e98a0eef..ed7cd5f53046 100644 --- a/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi @@ -48,7 +48,7 @@ regulator-max-microvolt = <3300000>; vin-supply = <&vcc_5v>; - enable-gpio = <&gpio GPIOE_2 GPIO_ACTIVE_HIGH>; + enable-gpio = <&gpio_ao GPIOE_2 GPIO_ACTIVE_HIGH>; enable-active-high; regulator-always-on; From a5be3e5d46f373fe1d2ee835c7ede31769c241cd Mon Sep 17 00:00:00 2001 From: Dongjin Kim Date: Fri, 28 Jan 2022 00:16:56 +0900 Subject: [PATCH 1062/1295] arm64: dts: meson-sm1-bananapi-m5: fix wrong GPIO domain for GPIOE_2 GPIOE_2 is in AO domain and "<&gpio GPIOE_2 ...>" changes the state of TF_PWR_EN of 'FC8731' on BPI-M5 Fixes: 976e920183e4 ("arm64: dts: meson-sm1: add Banana PI BPI-M5 board dts") Signed-off-by: Dongjin Kim Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20220127151656.GA2419733@paju --- arch/arm64/boot/dts/amlogic/meson-sm1-bananapi-m5.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-bananapi-m5.dts b/arch/arm64/boot/dts/amlogic/meson-sm1-bananapi-m5.dts index 212c6aa5a3b8..5751c48620ed 100644 --- a/arch/arm64/boot/dts/amlogic/meson-sm1-bananapi-m5.dts +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-bananapi-m5.dts @@ -123,7 +123,7 @@ regulator-min-microvolt = <1800000>; regulator-max-microvolt = <3300000>; - enable-gpio = <&gpio GPIOE_2 GPIO_ACTIVE_HIGH>; + enable-gpio = <&gpio_ao GPIOE_2 GPIO_ACTIVE_HIGH>; enable-active-high; regulator-always-on; From 76577c9137456febb05b0e17d244113196a98968 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Wed, 26 Jan 2022 04:49:52 +0000 Subject: [PATCH 1063/1295] arm64: dts: meson-gx: add ATF BL32 reserved-memory region Add an additional reserved memory region for the BL32 trusted firmware present in many devices that boot from Amlogic vendor u-boot. Suggested-by: Mateusz Krzak Signed-off-by: Christian Hewitt Reviewed-by: Neil Armstrong Reviewed-by: Kevin Hilman Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20220126044954.19069-2-christianshewitt@gmail.com --- arch/arm64/boot/dts/amlogic/meson-gx.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi index 6b457b2c30a4..aa14ea017a61 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi @@ -49,6 +49,12 @@ no-map; }; + /* 32 MiB reserved for ARM Trusted Firmware (BL32) */ + secmon_reserved_bl32: secmon@5300000 { + reg = <0x0 0x05300000 0x0 0x2000000>; + no-map; + }; + linux,cma { compatible = "shared-dma-pool"; reusable; From 08982a1b3aa2611c9c711d24825c9002d28536f4 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Wed, 26 Jan 2022 04:49:53 +0000 Subject: [PATCH 1064/1295] arm64: dts: meson-g12: add ATF BL32 reserved-memory region Add an additional reserved memory region for the BL32 trusted firmware present in many devices that boot from Amlogic vendor u-boot. Signed-off-by: Christian Hewitt Reviewed-by: Neil Armstrong Reviewed-by: Kevin Hilman Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20220126044954.19069-3-christianshewitt@gmail.com --- arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 517519e6e87f..f84d4b489e0b 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -107,6 +107,12 @@ no-map; }; + /* 32 MiB reserved for ARM Trusted Firmware (BL32) */ + secmon_reserved_bl32: secmon@5300000 { + reg = <0x0 0x05300000 0x0 0x2000000>; + no-map; + }; + linux,cma { compatible = "shared-dma-pool"; reusable; From f26573e2bc9dfd551a0d5c6971f18cc546543312 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Wed, 26 Jan 2022 04:49:54 +0000 Subject: [PATCH 1065/1295] arm64: dts: meson-g12: drop BL32 region from SEI510/SEI610 The BL32/TEE reserved-memory region is now inherited from the common family dtsi (meson-g12-common) so we can drop it from board files. Signed-off-by: Christian Hewitt Reviewed-by: Neil Armstrong Reviewed-by: Kevin Hilman Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20220126044954.19069-4-christianshewitt@gmail.com --- arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts | 8 -------- arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts | 8 -------- 2 files changed, 16 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts index d8838dde0f0f..4fb31c2ba31c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts @@ -157,14 +157,6 @@ regulator-always-on; }; - reserved-memory { - /* TEE Reserved Memory */ - bl32_reserved: bl32@5000000 { - reg = <0x0 0x05300000 0x0 0x2000000>; - no-map; - }; - }; - sdio_pwrseq: sdio-pwrseq { compatible = "mmc-pwrseq-simple"; reset-gpios = <&gpio GPIOX_6 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts index 427475846fc7..a5d79f2f7c19 100644 --- a/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts @@ -203,14 +203,6 @@ regulator-always-on; }; - reserved-memory { - /* TEE Reserved Memory */ - bl32_reserved: bl32@5000000 { - reg = <0x0 0x05300000 0x0 0x2000000>; - no-map; - }; - }; - sdio_pwrseq: sdio-pwrseq { compatible = "mmc-pwrseq-simple"; reset-gpios = <&gpio GPIOX_6 GPIO_ACTIVE_LOW>; From e6b03375132fefddc55cf700418cf794b3884e0c Mon Sep 17 00:00:00 2001 From: Lutz Koschorreck Date: Fri, 28 Jan 2022 20:31:50 +0100 Subject: [PATCH 1066/1295] arm64: dts: meson-sm1-odroid: fix boot loop after reboot Since the correct gpio pin is used for enabling tf-io regulator the system did not boot correctly after calling reboot. [ 36.862443] reboot: Restarting system bl31 reboot reason: 0xd bl31 reboot reason: 0x0 system cmd 1. SM1:BL:511f6b:81ca2f;FEAT:A0F83180:20282000;POC:B;RCY:0;SPINOR:0;CHK:1F;EMMC:800;NAND:81;SD?:0;SD:0;READ:0;0.0;CHK:0; bl2_stage_init 0x01 bl2_stage_init 0x81 hw id: SM1:BL:511f6b:81ca2f;FEAT:A0F83180:20282000;POC:B;RCY:0;SPINOR:0;CHK:1F;EMMC:800;NAND:81;SD?:0;SD:400;USB:8;LOOP:1;... Setting the gpio to open drain solves the issue. Fixes: 1f80a5cf74a6 ("arm64: dts: meson-sm1-odroid: add missing enable gpio and supply for tf_io regulator") Signed-off-by: Lutz Koschorreck Reviewed-by: Neil Armstrong [narmstrong: reduced serial log & removed invalid character in commit message] Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20220128193150.GA1304381@odroid-VirtualBox --- arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi b/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi index ed7cd5f53046..ddb1b345397f 100644 --- a/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-odroid.dtsi @@ -48,7 +48,7 @@ regulator-max-microvolt = <3300000>; vin-supply = <&vcc_5v>; - enable-gpio = <&gpio_ao GPIOE_2 GPIO_ACTIVE_HIGH>; + enable-gpio = <&gpio_ao GPIOE_2 GPIO_OPEN_DRAIN>; enable-active-high; regulator-always-on; From 6e37ec8825a113bc2dd1b280be10e5ac6eb4f6b1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Feb 2022 00:51:57 +0000 Subject: [PATCH 1067/1295] KVM: x86: Use ERR_PTR_USR() to return -EFAULT as a __user pointer Use ERR_PTR_USR() when returning -EFAULT from kvm_get_attr_addr(), sparse complains about implicitly casting the kernel pointer from ERR_PTR() into a __user pointer. >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * >> arch/x86/kvm/x86.c:4342:31: sparse: sparse: incorrect type in return expression (different address spaces) @@ expected void [noderef] __user * @@ got void * @@ arch/x86/kvm/x86.c:4342:31: sparse: expected void [noderef] __user * arch/x86/kvm/x86.c:4342:31: sparse: got void * No functional change intended. Fixes: 56f289a8d23a ("KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr") Reported-by: kernel test robot Signed-off-by: Sean Christopherson Message-Id: <20220202005157.2545816-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fec3dd4f0718..b533aab98172 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) @@ -4340,7 +4342,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) void __user *uaddr = (void __user*)(unsigned long)attr->addr; if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR(-EFAULT); + return ERR_PTR_USR(-EFAULT); return uaddr; } @@ -11684,8 +11686,6 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) - /** * __x86_set_memory_region: Setup KVM internal memory slot * From dd7f5a11ac5a6f733f422dc22b4d145d3260304e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2022 22:02:46 +0100 Subject: [PATCH 1068/1295] PCI/MSI: Remove bogus warning in pci_irq_get_affinity() The recent overhaul of pci_irq_get_affinity() introduced a regression when pci_irq_get_affinity() is called for an MSI-X interrupt which was not allocated with affinity descriptor information. The original code just returned a NULL pointer in that case, but the rework added a WARN_ON() under the assumption that the corresponding WARN_ON() in the MSI case can be applied to MSI-X as well. In fact the MSI warning in the original code does not make sense either because it's legitimate to invoke pci_irq_get_affinity() for a MSI interrupt which was not allocated with affinity descriptor information. Remove it and just return NULL as the original code did. Fixes: f48235900182 ("PCI/MSI: Simplify pci_irq_get_affinity()") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87ee4n38sm.ffs@tglx --- drivers/pci/msi/msi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index c19c7ca58186..9037a7827eca 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -1111,7 +1111,8 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) if (!desc) return cpu_possible_mask; - if (WARN_ON_ONCE(!desc->affinity)) + /* MSI[X] interrupts can be allocated without affinity descriptor */ + if (!desc->affinity) return NULL; /* From fe68195daf34d5dddacd3f93dd3eafc4beca3a0e Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 3 Feb 2022 14:49:16 -0800 Subject: [PATCH 1069/1295] ixgbevf: Require large buffers for build_skb on 82599VF From 4.17 onwards the ixgbevf driver uses build_skb() to build an skb around new data in the page buffer shared with the ixgbe PF. This uses either a 2K or 3K buffer, and offsets the DMA mapping by NET_SKB_PAD + NET_IP_ALIGN. When using a smaller buffer RXDCTL is set to ensure the PF does not write a full 2K bytes into the buffer, which is actually 2K minus the offset. However on the 82599 virtual function, the RXDCTL mechanism is not available. The driver attempts to work around this by using the SET_LPE mailbox method to lower the maximm frame size, but the ixgbe PF driver ignores this in order to keep the PF and all VFs in sync[0]. This means the PF will write up to the full 2K set in SRRCTL, causing it to write NET_SKB_PAD + NET_IP_ALIGN bytes past the end of the buffer. With 4K pages split into two buffers, this means it either writes NET_SKB_PAD + NET_IP_ALIGN bytes past the first buffer (and into the second), or NET_SKB_PAD + NET_IP_ALIGN bytes past the end of the DMA mapping. Avoid this by only enabling build_skb when using "large" buffers (3K). These are placed in each half of an order-1 page, preventing the PF from writing past the end of the mapping. [0]: Technically it only ever raises the max frame size, see ixgbe_set_vf_lpe() in ixgbe_sriov.c Fixes: f15c5ba5b6cd ("ixgbevf: add support for using order 1 pages to receive large frames") Signed-off-by: Samuel Mendoza-Jonas Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 0015fcf1df2b..0f293acd17e8 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1984,14 +1984,15 @@ static void ixgbevf_set_rx_buffer_len(struct ixgbevf_adapter *adapter, if (adapter->flags & IXGBEVF_FLAGS_LEGACY_RX) return; + if (PAGE_SIZE < 8192) + if (max_frame > IXGBEVF_MAX_FRAME_BUILD_SKB) + set_ring_uses_large_buffer(rx_ring); + + /* 82599 can't rely on RXDCTL.RLPML to restrict the size of the frame */ + if (adapter->hw.mac.type == ixgbe_mac_82599_vf && !ring_uses_large_buffer(rx_ring)) + return; + set_ring_build_skb_enabled(rx_ring); - - if (PAGE_SIZE < 8192) { - if (max_frame <= IXGBEVF_MAX_FRAME_BUILD_SKB) - return; - - set_ring_uses_large_buffer(rx_ring); - } } /** From 9b45a7738eec52bf0f5d8d3d54e822962781c5f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 4 Feb 2022 12:55:37 +0100 Subject: [PATCH 1070/1295] iommu/amd: Fix loop timeout issue in iommu_ga_log_enable() The polling loop for the register change in iommu_ga_log_enable() needs to have a udelay() in it. Otherwise the CPU might be faster than the IOMMU hardware and wrongly trigger the WARN_ON() further down the code stream. Use a 10us for udelay(), has there is some hardware where activation of the GA log can take more than a 100ms. A future optimization should move the activation check of the GA log to the point where it gets used for the first time. But that is a bigger change and not suitable for a fix. Fixes: 8bda0cfbdc1a ("iommu/amd: Detect and initialize guest vAPIC log") Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20220204115537.3894-1-joro@8bytes.org --- drivers/iommu/amd/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index dc338acf3338..b10fb52ea442 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -834,6 +835,7 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; + udelay(10); } if (WARN_ON(i >= LOOP_TIMEOUT)) From 5d5ead5e1cafa6a6076e8c3b55f707eafc76bfbd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Feb 2022 17:00:26 +0200 Subject: [PATCH 1071/1295] serial: 8250_pericom: Revert "Re-enable higher baud rates" UPF_MAGIC_MULTIPLIER is userspace available bit and can be changed at any time. There is no sense to rely on it to be always present. This reverts commit b4ccaf5aa2d795ee7f47a6eeb209f3de981e1929. Note, that code was not reliably worked before, hence it implies no functional change. Signed-off-by: Andy Shevchenko Fixes: b4ccaf5aa2d7 ("serial: 8250_pericom: Re-enable higher baud rates") Link: https://lore.kernel.org/r/20220203150026.19087-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pericom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_pericom.c b/drivers/tty/serial/8250/8250_pericom.c index 025b055363c3..95ff10f25d58 100644 --- a/drivers/tty/serial/8250/8250_pericom.c +++ b/drivers/tty/serial/8250/8250_pericom.c @@ -117,7 +117,7 @@ static int pericom8250_probe(struct pci_dev *pdev, const struct pci_device_id *i uart.port.private_data = pericom; uart.port.iotype = UPIO_PORT; uart.port.uartclk = 921600 * 16; - uart.port.flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF | UPF_SHARE_IRQ | UPF_MAGIC_MULTIPLIER; + uart.port.flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF | UPF_SHARE_IRQ; uart.port.set_divisor = pericom_do_set_divisor; for (i = 0; i < nr && i < maxnr; i++) { unsigned int offset = (i == 3 && nr == 4) ? 0x38 : i * 0x8; From 61cc70d9e8ef5b042d4ed87994d20100ec8896d9 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 27 Jan 2022 15:44:04 +0100 Subject: [PATCH 1072/1295] vt_ioctl: fix array_index_nospec in vt_setactivate array_index_nospec ensures that an out-of-bounds value is set to zero on the transient path. Decreasing the value by one afterwards causes a transient integer underflow. vsa.console should be decreased first and then sanitized with array_index_nospec. Kasper Acknowledgements: Jakob Koschel, Brian Johannesmeyer, Kaveh Razavi, Herbert Bos, Cristiano Giuffrida from the VUSec group at VU Amsterdam. Co-developed-by: Brian Johannesmeyer Signed-off-by: Brian Johannesmeyer Signed-off-by: Jakob Koschel Link: https://lore.kernel.org/r/20220127144406.3589293-1-jakobkoschel@gmail.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 3639bb6dc372..e0714a9c9fd7 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -599,8 +599,8 @@ static int vt_setactivate(struct vt_setactivate __user *sa) if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES) return -ENXIO; - vsa.console = array_index_nospec(vsa.console, MAX_NR_CONSOLES + 1); vsa.console--; + vsa.console = array_index_nospec(vsa.console, MAX_NR_CONSOLES); console_lock(); ret = vc_allocate(vsa.console); if (ret) { From 28cb138f559f8c1a1395f5564f86b8bbee83631b Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 27 Jan 2022 15:44:05 +0100 Subject: [PATCH 1073/1295] vt_ioctl: add array_index_nospec to VT_ACTIVATE in vt_setactivate an almost identical code path has been patched with array_index_nospec. In the VT_ACTIVATE path the user input is from a system call argument instead of a usercopy. For consistency both code paths should have the same mitigations applied. Kasper Acknowledgements: Jakob Koschel, Brian Johannesmeyer, Kaveh Razavi, Herbert Bos, Cristiano Giuffrida from the VUSec group at VU Amsterdam. Co-developed-by: Brian Johannesmeyer Signed-off-by: Brian Johannesmeyer Signed-off-by: Jakob Koschel Link: https://lore.kernel.org/r/20220127144406.3589293-2-jakobkoschel@gmail.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index e0714a9c9fd7..58013698635f 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -845,6 +845,7 @@ int vt_ioctl(struct tty_struct *tty, return -ENXIO; arg--; + arg = array_index_nospec(arg, MAX_NR_CONSOLES); console_lock(); ret = vc_allocate(arg); console_unlock(); From c0689e46be23160d925dca95dfc411f1a0462708 Mon Sep 17 00:00:00 2001 From: Jonas Malaco Date: Thu, 3 Feb 2022 13:49:52 -0300 Subject: [PATCH 1074/1295] eeprom: ee1004: limit i2c reads to I2C_SMBUS_BLOCK_MAX Commit effa453168a7 ("i2c: i801: Don't silently correct invalid transfer size") revealed that ee1004_eeprom_read() did not properly limit how many bytes to read at once. In particular, i2c_smbus_read_i2c_block_data_or_emulated() takes the length to read as an u8. If count == 256 after taking into account the offset and page boundary, the cast to u8 overflows. And this is common when user space tries to read the entire EEPROM at once. To fix it, limit each read to I2C_SMBUS_BLOCK_MAX (32) bytes, already the maximum length i2c_smbus_read_i2c_block_data_or_emulated() allows. Fixes: effa453168a7 ("i2c: i801: Don't silently correct invalid transfer size") Cc: stable@vger.kernel.org Reviewed-by: Heiner Kallweit Signed-off-by: Jonas Malaco Link: https://lore.kernel.org/r/20220203165024.47767-1-jonas@protocubo.io Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/ee1004.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/misc/eeprom/ee1004.c b/drivers/misc/eeprom/ee1004.c index bb9c4512c968..9fbfe784d710 100644 --- a/drivers/misc/eeprom/ee1004.c +++ b/drivers/misc/eeprom/ee1004.c @@ -114,6 +114,9 @@ static ssize_t ee1004_eeprom_read(struct i2c_client *client, char *buf, if (offset + count > EE1004_PAGE_SIZE) count = EE1004_PAGE_SIZE - offset; + if (count > I2C_SMBUS_BLOCK_MAX) + count = I2C_SMBUS_BLOCK_MAX; + return i2c_smbus_read_i2c_block_data_or_emulated(client, offset, count, buf); } From a85468b766d3bc17c8b17ed23a36ef6469340bb2 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 3 Feb 2022 20:49:06 -0800 Subject: [PATCH 1075/1295] Revert "mm/page_isolation: unset migratetype directly for non Buddy page" This reverts commit 721fb891ad0b3956d5c168b2931e3e5e4fb7ca40. Commit 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") will result memory that should in buddy disappear by mistake. move_freepages_block moves all pages in pageblock instead of pages indicated by input parameter, so if input pages is not in buddy but other pages in pageblock is in buddy, it will result in page out of control. Link: https://lkml.kernel.org/r/20220126024436.13921-1-chenwandun@huawei.com Fixes: 721fb891ad0b ("mm/page_isolation: unset migratetype directly for non Buddy page") Signed-off-by: Chen Wandun Reported-by: "kernelci.org bot" Acked-by: David Hildenbrand Tested-by: Dong Aisheng Tested-by: Francesco Dolcini Acked-by: Vlastimil Babka Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6a0ddda6b3c5..f67c4c70f17f 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page && PageBuddy(page)) { + if (!isolated_page) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } From fb5222aae64fe25e5f3ebefde8214dcf3ba33ca5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:10 -0800 Subject: [PATCH 1076/1295] mm/debug_vm_pgtable: remove pte entry from the page table Patch series "page table check fixes and cleanups", v5. This patch (of 4): The pte entry that is used in pte_advanced_tests() is never removed from the page table at the end of the test. The issue is detected by page_table_check, to repro compile kernel with the following configs: CONFIG_DEBUG_VM_PGTABLE=y CONFIG_PAGE_TABLE_CHECK=y CONFIG_PAGE_TABLE_CHECK_ENFORCED=y During the boot the following BUG is printed: debug_vm_pgtable: [debug_vm_pgtable ]: Validating architecture page table helpers ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.16.0-11413-g2c271fe77d52 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 ... The entry should be properly removed from the page table before the page is released to the free list. Link: https://lkml.kernel.org/r/20220131203249.2832273-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20220131203249.2832273-2-pasha.tatashin@soleen.com Fixes: a5c3b9ffb0f4 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") Signed-off-by: Pasha Tatashin Reviewed-by: Zi Yan Tested-by: Zi Yan Acked-by: David Rientjes Reviewed-by: Anshuman Khandual Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Will Deacon Cc: Mike Rapoport Cc: Dave Hansen Cc: H. Peter Anvin Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Cc: [5.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a7ac97c76762..db2abd9e415b 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); + + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) From 64d8b9e14512ceb7bf11b235faeb8531aeb4d9d3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:15 -0800 Subject: [PATCH 1077/1295] mm/page_table_check: use unsigned long for page counters and cleanup For consistency, use "unsigned long" for all page counters. Also, reduce code duplication by calling __page_table_check_*_clear() from __page_table_check_*_set() functions. Link: https://lkml.kernel.org/r/20220131203249.2832273-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Wei Xu Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_table_check.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 7504e7caa2a1..c61d7ebe13b1 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -86,8 +86,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -121,8 +121,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, { struct page_ext *page_ext; struct page *page; + unsigned long i; bool anon; - int i; if (!pfn_valid(pfn)) return; @@ -152,10 +152,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext *page_ext = lookup_page_ext(page); - int i; + unsigned long i; BUG_ON(!page_ext); - for (i = 0; i < (1 << order); i++) { + for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); @@ -206,17 +206,10 @@ EXPORT_SYMBOL(__page_table_check_pud_clear); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - pte_t old_pte; - if (&init_mm == mm) return; - old_pte = *ptep; - if (pte_user_accessible_page(old_pte)) { - page_table_check_clear(mm, addr, pte_pfn(old_pte), - PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pte_clear(mm, addr, *ptep); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -228,17 +221,10 @@ EXPORT_SYMBOL(__page_table_check_pte_set); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - pmd_t old_pmd; - if (&init_mm == mm) return; - old_pmd = *pmdp; - if (pmd_user_accessible_page(old_pmd)) { - page_table_check_clear(mm, addr, pmd_pfn(old_pmd), - PMD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(mm, addr, pmd_pfn(pmd), PMD_PAGE_SIZE >> PAGE_SHIFT, @@ -250,17 +236,10 @@ EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - pud_t old_pud; - if (&init_mm == mm) return; - old_pud = *pudp; - if (pud_user_accessible_page(old_pud)) { - page_table_check_clear(mm, addr, pud_pfn(old_pud), - PUD_PAGE_SIZE >> PAGE_SHIFT); - } - + __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(mm, addr, pud_pfn(pud), PUD_PAGE_SIZE >> PAGE_SHIFT, From e59a47b8a45353d9ee234aab2d229474e09885df Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:20 -0800 Subject: [PATCH 1078/1295] mm/khugepaged: unify collapse pmd clear, flush and free Unify the code that flushes, clears pmd entry, and frees the PTE table level into a new function collapse_and_free_pmd(). This cleanup is useful as in the next patch we will add another call to this function to iterate through PTE prior to freeing the level for page table check. Link: https://lkml.kernel.org/r/20220131203249.2832273-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 35f14d0a00a6..30e59e4af272 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1416,6 +1416,19 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return 0; } +static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + spinlock_t *ptl; + pmd_t pmd; + + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_collapse_flush(vma, addr, pmdp); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(pmd)); +} + /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. @@ -1433,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma = find_vma(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd, _pmd; + pmd_t *pmd; spinlock_t *ptl; int count = 0; int i; @@ -1509,12 +1522,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); - _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - + collapse_and_free_pmd(mm, vma, haddr, pmd); drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1552,7 +1560,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; - pmd_t *pmd, _pmd; + pmd_t *pmd; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1591,14 +1599,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * reverse order. Trylock is a way to avoid deadlock. */ if (mmap_write_trylock(mm)) { - if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); - /* assume page table is clear */ - _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - } + if (!khugepaged_test_exit(mm)) + collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { /* Try again later */ From 80110bbfbba6f0078d5a1cbc8df004506db8ffe5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:24 -0800 Subject: [PATCH 1079/1295] mm/page_table_check: check entries at pmd levels syzbot detected a case where the page table counters were not properly updated. syzkaller login: ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4 RIP: 0010:__page_table_check_zero+0x159/0x1a0 Call Trace: free_pcp_prepare+0x3be/0xaa0 free_unref_page+0x1c/0x650 free_compound_page+0xec/0x130 free_transhuge_page+0x1be/0x260 __put_compound_page+0x90/0xd0 release_pages+0x54c/0x1060 __pagevec_release+0x7c/0x110 shmem_undo_range+0x85e/0x1250 ... The repro involved having a huge page that is split due to uprobe event temporarily replacing one of the pages in the huge page. Later the huge page was combined again, but the counters were off, as the PTE level was not properly updated. Make sure that when PMD is cleared and prior to freeing the level the PTEs are updated. Link: https://lkml.kernel.org/r/20220131203249.2832273-5-pasha.tatashin@soleen.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 19 +++++++++++++++++++ mm/khugepaged.c | 3 +++ mm/page_table_check.c | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 38cace1da7b6..01e16c7696ec 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -26,6 +26,9 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { @@ -100,6 +103,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, __page_table_check_pud_set(mm, addr, pudp, pud); } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear_range(mm, addr, pmd); +} + #else static inline void page_table_check_alloc(struct page *page, unsigned int order) @@ -143,5 +156,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, { } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ +} + #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 30e59e4af272..131492fd1148 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1422,10 +1423,12 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v spinlock_t *ptl; pmd_t pmd; + mmap_assert_write_locked(mm); ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_collapse_flush(vma, addr, pmdp); spin_unlock(ptl); mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index c61d7ebe13b1..3763bd077861 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -247,3 +247,23 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, } } EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + pte_unmap(ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + } +} From 314c459a6fe0957b5885fbc65c53d51444092880 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:29 -0800 Subject: [PATCH 1080/1295] mm/pgtable: define pte_index so that preprocessor could recognize it Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") pte_index is a static inline and there is no define for it that can be recognized by the preprocessor. As a result, vm_insert_pages() uses slower loop over vm_insert_page() instead of insert_pages() that amortizes the cost of spinlock operations when inserting multiple pages. Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Signed-off-by: Mike Rapoport Reported-by: Christian Dietrich Reviewed-by: Khalid Aziz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc8713a76e03..f4f4077b97aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } +#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) From 520ba724061cef59763e2b6f5b26e8387c2e5822 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 3 Feb 2022 20:49:33 -0800 Subject: [PATCH 1081/1295] ipc/sem: do not sleep with a spin lock held We can't call kvfree() with a spin lock held, so defer it. Link: https://lkml.kernel.org/r/20211223031207.556189-1-chi.minghao@zte.com.cn Fixes: fc37a3b8b438 ("[PATCH] ipc sem: use kvmalloc for sem_undo allocation") Reported-by: Zeal Robot Signed-off-by: Minghao Chi Reviewed-by: Shakeel Butt Reviewed-by: Manfred Spraul Cc: Arnd Bergmann Cc: Yang Guang Cc: Davidlohr Bueso Cc: Randy Dunlap Cc: Bhaskar Chowdhury Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 6693daf4fe11..0dbdb98fdf2d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1964,6 +1964,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) */ un = lookup_undo(ulp, semid); if (un) { + spin_unlock(&ulp->lock); kvfree(new); goto success; } @@ -1976,9 +1977,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; - -success: spin_unlock(&ulp->lock); +success: sem_unlock(sma, -1); out: return un; From c10a0f877fe007021d70f9cada240f42adc2b5db Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Thu, 3 Feb 2022 20:49:37 -0800 Subject: [PATCH 1082/1295] mm/kmemleak: avoid scanning potential huge holes When using devm_request_free_mem_region() and devm_memremap_pages() to add ZONE_DEVICE memory, if requested free mem region's end pfn were huge(e.g., 0x400000000), the node_end_pfn() will be also huge (see move_pfn_range_to_zone()). Thus it creates a huge hole between node_start_pfn() and node_end_pfn(). We found on some AMD APUs, amdkfd requested such a free mem region and created a huge hole. In such a case, following code snippet was just doing busy test_bit() looping on the huge hole. for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); if (!page) continue; ... } So we got a soft lockup: watchdog: BUG: soft lockup - CPU#6 stuck for 26s! [bash:1221] CPU: 6 PID: 1221 Comm: bash Not tainted 5.15.0-custom #1 RIP: 0010:pfn_to_online_page+0x5/0xd0 Call Trace: ? kmemleak_scan+0x16a/0x440 kmemleak_write+0x306/0x3a0 ? common_file_perm+0x72/0x170 full_proxy_write+0x5c/0x90 vfs_write+0xb9/0x260 ksys_write+0x67/0xe0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae I did some tests with the patch. (1) amdgpu module unloaded before the patch: real 0m0.976s user 0m0.000s sys 0m0.968s after the patch: real 0m0.981s user 0m0.000s sys 0m0.973s (2) amdgpu module loaded before the patch: real 0m35.365s user 0m0.000s sys 0m35.354s after the patch: real 0m1.049s user 0m0.000s sys 0m1.042s Link: https://lkml.kernel.org/r/20211108140029.721144-1-lang.yu@amd.com Signed-off-by: Lang Yu Acked-by: David Hildenbrand Acked-by: Catalin Marinas Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index dc3758fdba68..7580baa76af1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1410,7 +1410,8 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object; - int i; + struct zone *zone; + int __maybe_unused i; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1450,9 +1451,9 @@ static void kmemleak_scan(void) * Struct page scanning for each node. */ get_online_mems(); - for_each_online_node(i) { - unsigned long start_pfn = node_start_pfn(i); - unsigned long end_pfn = node_end_pfn(i); + for_each_populated_zone(zone) { + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -1461,8 +1462,8 @@ static void kmemleak_scan(void) if (!page) continue; - /* only scan pages belonging to this node */ - if (page_to_nid(page) != i) + /* only scan pages belonging to this zone */ + if (page_zone(page) != zone) continue; /* only scan if page is in use */ if (page_count(page) == 0) From 6a0fb704b05cd143dfe2c6a4969c41c59a04b330 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:41 -0800 Subject: [PATCH 1083/1295] MAINTAINERS: update rppt's email Use my @kernel.org address Link: https://lkml.kernel.org/r/20220203090324.3701774-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae..c820e03c4414 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12400,7 +12400,7 @@ F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c MEMBLOCK -M: Mike Rapoport +M: Mike Rapoport L: linux-mm@kvack.org S: Maintained F: Documentation/core-api/boot-time-mm.rst From 07d2505b963b2d30f747dce338211f51068b8765 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 3 Feb 2022 20:49:45 -0800 Subject: [PATCH 1084/1295] kselftest/vm: revert "tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner" With this change, userfaultfd fails to build with undefined reference swap() error: userfaultfd.c: In function `userfaultfd_stress': userfaultfd.c:1530:17: warning: implicit declaration of function `swap'; did you mean `swab'? [-Wimplicit-function-declaration] 1530 | swap(area_src, area_dst); | ^~~~ | swab /usr/bin/ld: /tmp/ccDGOAdV.o: in function `userfaultfd_stress': userfaultfd.c:(.text+0x549e): undefined reference to `swap' /usr/bin/ld: userfaultfd.c:(.text+0x54bc): undefined reference to `swap' collect2: error: ld returned 1 exit status Revert the commit to fix the problem. Link: https://lkml.kernel.org/r/20220202003340.87195-1-skhan@linuxfoundation.org Fixes: 2c769ed7137a ("tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner") Signed-off-by: Shuah Khan Cc: Shuah Khan Cc: Minghao Chi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d3fd24f9fae8..2f49c9af1b58 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -1417,6 +1417,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) static int userfaultfd_stress(void) { void *area; + char *tmp_area; unsigned long nr; struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; @@ -1527,9 +1528,13 @@ static int userfaultfd_stress(void) count_verify[nr], nr); /* prepare next bounce */ - swap(area_src, area_dst); + tmp_area = area_src; + area_src = area_dst; + area_dst = tmp_area; - swap(area_src_alias, area_dst_alias); + tmp_area = area_src_alias; + area_src_alias = area_dst_alias; + area_dst_alias = tmp_area; uffd_stats_report(uffd_stats, nr_cpus); } From 8619e32825fd0af82d243e585b9aa6917b99a975 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 4 Feb 2022 17:24:02 +0100 Subject: [PATCH 1085/1295] rtla: Follow kernel version To avoid having commits with new version, it is just easier to follow kernel version. Link: https://lkml.kernel.org/r/9c2df0d1de65cea96c7d731fe64781a2bb90c5b3.1643990447.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 7c39728d08de..5a1eda617992 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -1,5 +1,6 @@ NAME := rtla -VERSION := 0.5 +# Follow the kernel version +VERSION := $(shell cat VERSION 2> /dev/null || make -sC ../../.. kernelversion) # From libtracefs: # Makefiles suck: This macro sets a default value of $(2) for the @@ -85,6 +86,7 @@ clean: doc_clean tarball: clean rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION) + echo $(VERSION) > $(NAME)-$(VERSION)/VERSION cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION) mkdir $(NAME)-$(VERSION)/Documentation/ cp -rp $(SRCTREE)/../../../Documentation/tools/rtla/* $(NAME)-$(VERSION)/Documentation/ From f35491b861290a2c8258b5f70f9bb5d5ed2a1c6f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 4 Feb 2022 17:24:03 +0100 Subject: [PATCH 1086/1295] rtla/utils: Fix session duration parsing Use gmtime to format the duration time. This avoids problems when the system uses local time different of Pisa's Local Time. Link: https://lkml.kernel.org/r/a2f0a37bc006c2561bb8ecd871cd70532b4a9f2d.1643990447.git.bristot@kernel.org Fixes: b1696371d865 ("rtla: Helper functions for rtla") Cc: Daniel Bristot de Oliveira Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 1c9f0eea6166..ffaf8ec84001 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -77,11 +77,11 @@ void get_duration(time_t start_time, char *output, int output_size) time_t duration; duration = difftime(now, start_time); - tm_info = localtime(&duration); + tm_info = gmtime(&duration); snprintf(output, output_size, "%3d %02d:%02d:%02d", tm_info->tm_yday, - tm_info->tm_hour - 1, + tm_info->tm_hour, tm_info->tm_min, tm_info->tm_sec); } From 1a6229096bb501495442ab47761d746c1ae791e1 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 4 Feb 2022 17:24:04 +0100 Subject: [PATCH 1087/1295] rtla/trace: Error message fixup Use capital and change "tracer %s" to "%s tracer". No functional change. Link: https://lkml.kernel.org/r/361697d27431afefa64c67c323564205385c418d.1643990447.git.bristot@kernel.org Fixes: b1696371d865 ("rtla: Helper functions for rtla") Cc: Daniel Bristot de Oliveira Cc: linux-kernel@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index 107a0c6387f7..83de259abcc1 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -20,14 +20,14 @@ int enable_tracer_by_name(struct tracefs_instance *inst, const char *tracer_name tracer = TRACEFS_TRACER_CUSTOM; - debug_msg("enabling %s tracer\n", tracer_name); + debug_msg("Enabling %s tracer\n", tracer_name); retval = tracefs_tracer_set(inst, tracer, tracer_name); if (retval < 0) { if (errno == ENODEV) - err_msg("tracer %s not found!\n", tracer_name); + err_msg("Tracer %s not found!\n", tracer_name); - err_msg("failed to enable the tracer %s\n", tracer_name); + err_msg("Failed to enable the %s tracer\n", tracer_name); return -1; } @@ -44,7 +44,7 @@ void disable_tracer(struct tracefs_instance *inst) retval = tracefs_tracer_set(inst, t); if (retval < 0) - err_msg("oops, error disabling tracer\n"); + err_msg("Oops, error disabling tracer\n"); } /* From 6a00ef4493706a23120057fafbc62379bcde11ec Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 17 Jan 2022 23:44:33 +0800 Subject: [PATCH 1088/1295] riscv: eliminate unreliable __builtin_frame_address(1) I tried different pieces of code which uses __builtin_frame_address(1) (with both gcc version 7.5.0 and 10.3.0) to verify whether it works as expected on riscv64. The result is negative. What the compiler had generated is as below: 31 fp = (unsigned long)__builtin_frame_address(1); 0xffffffff80006024 <+200>: ld s1,0(s0) It takes '0(s0)' as the address of frame 1 (caller), but the actual address should be '-16(s0)'. | ... | <-+ +-----------------+ | | return address | | | previous fp | | | saved registers | | | local variables | | $fp --> | ... | | +-----------------+ | | return address | | | previous fp --------+ | saved registers | $sp --> | local variables | +-----------------+ This leads the kernel can not dump the full stack trace on riscv. [ 7.222126][ T1] Call Trace: [ 7.222804][ T1] [] dump_backtrace+0x2c/0x3a This problem is not exposed on most riscv builds just because the '0(s0)' occasionally is the address frame 2 (caller's caller), if only ra and fp are stored in frame 1 (caller). | ... | <-+ +-----------------+ | | return address | | $fp --> | previous fp | | +-----------------+ | | return address | | | previous fp --------+ | saved registers | $sp --> | local variables | +-----------------+ This could be a *bug* of gcc that should be fixed. But as noted in gcc manual "Calling this function with a nonzero argument can have unpredictable effects, including crashing the calling program.", let's remove the '__builtin_frame_address(1)' in backtrace code. With this fix now it can show full stack trace: [ 10.444838][ T1] Call Trace: [ 10.446199][ T1] [] dump_backtrace+0x2c/0x3a [ 10.447711][ T1] [] show_stack+0x32/0x3e [ 10.448710][ T1] [] dump_stack_lvl+0x58/0x7a [ 10.449941][ T1] [] dump_stack+0x14/0x1c [ 10.450929][ T1] [] ubsan_epilogue+0x10/0x5a [ 10.451869][ T1] [] __ubsan_handle_load_invalid_value+0x6c/0x78 [ 10.453049][ T1] [] __pagevec_release+0x62/0x64 [ 10.455476][ T1] [] truncate_inode_pages_range+0x132/0x5be [ 10.456798][ T1] [] truncate_inode_pages+0x24/0x30 [ 10.457853][ T1] [] kill_bdev+0x32/0x3c ... Signed-off-by: Changbin Du Fixes: eac2f3059e02 ("riscv: stacktrace: fix the riscv stacktrace when CONFIG_FRAME_POINTER enabled") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 201ee206fb57..14d2b53ec322 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -22,15 +22,16 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg) { unsigned long fp, sp, pc; + int level = 0; if (regs) { fp = frame_pointer(regs); sp = user_stack_pointer(regs); pc = instruction_pointer(regs); } else if (task == NULL || task == current) { - fp = (unsigned long)__builtin_frame_address(1); - sp = (unsigned long)__builtin_frame_address(0); - pc = (unsigned long)__builtin_return_address(0); + fp = (unsigned long)__builtin_frame_address(0); + sp = sp_in_global; + pc = (unsigned long)walk_stackframe; } else { /* task blocked in __switch_to */ fp = task->thread.s[0]; @@ -42,7 +43,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, unsigned long low, high; struct stackframe *frame; - if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc))) + if (unlikely(!__kernel_text_address(pc) || (level++ >= 1 && !fn(arg, pc)))) break; /* Validate frame pointer */ From d2a02e3c8bb6b347818518edff5a4b40ff52d6d8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 19 Jan 2022 14:35:06 +0100 Subject: [PATCH 1089/1295] lib/crypto: blake2s: avoid indirect calls to compression function for Clang CFI blake2s_compress_generic is weakly aliased by blake2s_compress. The current harness for function selection uses a function pointer, which is ordinarily inlined and resolved at compile time. But when Clang's CFI is enabled, CFI still triggers when making an indirect call via a weak symbol. This seems like a bug in Clang's CFI, as though it's bucketing weak symbols and strong symbols differently. It also only seems to trigger when "full LTO" mode is used, rather than "thin LTO". [ 0.000000][ T0] Kernel panic - not syncing: CFI failure (target: blake2s_compress_generic+0x0/0x1444) [ 0.000000][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-mainline-06981-g076c855b846e #1 [ 0.000000][ T0] Hardware name: MT6873 (DT) [ 0.000000][ T0] Call trace: [ 0.000000][ T0] dump_backtrace+0xfc/0x1dc [ 0.000000][ T0] dump_stack_lvl+0xa8/0x11c [ 0.000000][ T0] panic+0x194/0x464 [ 0.000000][ T0] __cfi_check_fail+0x54/0x58 [ 0.000000][ T0] __cfi_slowpath_diag+0x354/0x4b0 [ 0.000000][ T0] blake2s_update+0x14c/0x178 [ 0.000000][ T0] _extract_entropy+0xf4/0x29c [ 0.000000][ T0] crng_initialize_primary+0x24/0x94 [ 0.000000][ T0] rand_initialize+0x2c/0x6c [ 0.000000][ T0] start_kernel+0x2f8/0x65c [ 0.000000][ T0] __primary_switched+0xc4/0x7be4 [ 0.000000][ T0] Rebooting in 5 seconds.. Nonetheless, the function pointer method isn't so terrific anyway, so this patch replaces it with a simple boolean, which also gets inlined away. This successfully works around the Clang bug. In general, I'm not too keen on all of the indirection involved here; it clearly does more harm than good. Hopefully the whole thing can get cleaned up down the road when lib/crypto is overhauled more comprehensively. But for now, we go with a simple bandaid. Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in") Link: https://github.com/ClangBuiltLinux/linux/issues/1567 Reported-by: Miles Chen Tested-by: Miles Chen Tested-by: Nathan Chancellor Tested-by: John Stultz Acked-by: Nick Desaulniers Reviewed-by: Eric Biggers Signed-off-by: Jason A. Donenfeld --- arch/arm/crypto/blake2s-shash.c | 4 ++-- arch/x86/crypto/blake2s-shash.c | 4 ++-- crypto/blake2s_generic.c | 4 ++-- include/crypto/internal/blake2s.h | 40 +++++++++++++++++++------------ lib/crypto/blake2s.c | 4 ++-- 5 files changed, 33 insertions(+), 23 deletions(-) diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c index 17c1c3bfe2f5..763c73beea2d 100644 --- a/arch/arm/crypto/blake2s-shash.c +++ b/arch/arm/crypto/blake2s-shash.c @@ -13,12 +13,12 @@ static int crypto_blake2s_update_arm(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c index f9e2fecdb761..59ae28abe35c 100644 --- a/arch/x86/crypto/blake2s-shash.c +++ b/arch/x86/crypto/blake2s-shash.c @@ -18,12 +18,12 @@ static int crypto_blake2s_update_x86(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c index 72fe480f9bd6..5f96a21f8788 100644 --- a/crypto/blake2s_generic.c +++ b/crypto/blake2s_generic.c @@ -15,12 +15,12 @@ static int crypto_blake2s_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress_generic); + return crypto_blake2s_update(desc, in, inlen, true); } static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress_generic); + return crypto_blake2s_final(desc, out, true); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index d39cfa0d333e..52363eee2b20 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -24,14 +24,11 @@ static inline void blake2s_set_lastblock(struct blake2s_state *state) state->f[0] = -1; } -typedef void (*blake2s_compress_t)(struct blake2s_state *state, - const u8 *block, size_t nblocks, u32 inc); - /* Helper functions for BLAKE2s shared by the library and shash APIs */ -static inline void __blake2s_update(struct blake2s_state *state, - const u8 *in, size_t inlen, - blake2s_compress_t compress) +static __always_inline void +__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen, + bool force_generic) { const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; @@ -39,7 +36,12 @@ static inline void __blake2s_update(struct blake2s_state *state, return; if (inlen > fill) { memcpy(state->buf + state->buflen, in, fill); - (*compress)(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); state->buflen = 0; in += fill; inlen -= fill; @@ -47,7 +49,12 @@ static inline void __blake2s_update(struct blake2s_state *state, if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); /* Hash one less (full) block than strictly possible */ - (*compress)(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } @@ -55,13 +62,16 @@ static inline void __blake2s_update(struct blake2s_state *state, state->buflen += inlen; } -static inline void __blake2s_final(struct blake2s_state *state, u8 *out, - blake2s_compress_t compress) +static __always_inline void +__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic) { blake2s_set_lastblock(state); memset(state->buf + state->buflen, 0, BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - (*compress)(state, state->buf, 1, state->buflen); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, state->buflen); + else + blake2s_compress(state, state->buf, 1, state->buflen); cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); memcpy(out, state->h, state->outlen); } @@ -99,20 +109,20 @@ static inline int crypto_blake2s_init(struct shash_desc *desc) static inline int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, unsigned int inlen, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_update(state, in, inlen, compress); + __blake2s_update(state, in, inlen, force_generic); return 0; } static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_final(state, out, compress); + __blake2s_final(state, out, force_generic); return 0; } diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 9364f79937b8..c71c09621c09 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -18,14 +18,14 @@ void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) { - __blake2s_update(state, in, inlen, blake2s_compress); + __blake2s_update(state, in, inlen, false); } EXPORT_SYMBOL(blake2s_update); void blake2s_final(struct blake2s_state *state, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); - __blake2s_final(state, out, blake2s_compress); + __blake2s_final(state, out, false); memzero_explicit(state, sizeof(*state)); } EXPORT_SYMBOL(blake2s_final); From c321e907aa4803d562d6e70ebed9444ad082f953 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 25 Jan 2022 21:14:57 +0100 Subject: [PATCH 1090/1295] random: continually use hwgenerator randomness The rngd kernel thread may sleep indefinitely if the entropy count is kept above random_write_wakeup_bits by other entropy sources. To make best use of multiple sources of randomness, mix entropy from hardware RNGs into the pool at least once within CRNG_RESEED_INTERVAL. Cc: Herbert Xu Cc: Jason A. Donenfeld Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 68613f0b6887..3e133d9583a0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2205,13 +2205,15 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, return; } - /* Suspend writing if we're above the trickle threshold. + /* Throttle writing if we're above the trickle threshold. * We'll be woken up again once below random_write_wakeup_thresh, - * or when the calling thread is about to terminate. + * when the calling thread is about to terminate, or once + * CRNG_RESEED_INTERVAL has lapsed. */ - wait_event_interruptible(random_write_wait, + wait_event_interruptible_timeout(random_write_wait, !system_wq || kthread_should_stop() || - POOL_ENTROPY_BITS() <= random_write_wakeup_bits); + POOL_ENTROPY_BITS() <= random_write_wakeup_bits, + CRNG_RESEED_INTERVAL); mix_pool_bytes(buffer, count); credit_entropy_bits(entropy); } From 042e293e16e3aa9794ce60c29f5b7b0c8170f933 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 28 Jan 2022 23:44:03 +0100 Subject: [PATCH 1091/1295] random: wake up /dev/random writers after zap When account() is called, and the amount of entropy dips below random_write_wakeup_bits, we wake up the random writers, so that they can write some more in. However, the RNDZAPENTCNT/RNDCLEARPOOL ioctl sets the entropy count to zero -- a potential reduction just like account() -- but does not unblock writers. This commit adds the missing logic to that ioctl to unblock waiting writers. Reviewed-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 3e133d9583a0..c336c78820cd 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1856,7 +1856,10 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - input_pool.entropy_count = 0; + if (xchg(&input_pool.entropy_count, 0) && random_write_wakeup_bits) { + wake_up_interruptible(&random_write_wait); + kill_fasync(&fasync, SIGIO, POLL_OUT); + } return 0; case RNDRESEEDCRNG: if (!capable(CAP_SYS_ADMIN)) From ebf7606388732ecf2821ca21087e9446cb4a5b57 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 30 Jan 2022 22:03:19 +0100 Subject: [PATCH 1092/1295] random: access primary_pool directly rather than through pointer Both crng_initialize_primary() and crng_init_try_arch_early() are only called for the primary_pool. Accessing it directly instead of through a function parameter simplifies the code. Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c336c78820cd..c49f9c5b8110 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -762,7 +762,7 @@ static bool crng_init_try_arch(struct crng_state *crng) return arch_init; } -static bool __init crng_init_try_arch_early(struct crng_state *crng) +static bool __init crng_init_try_arch_early(void) { int i; bool arch_init = true; @@ -774,7 +774,7 @@ static bool __init crng_init_try_arch_early(struct crng_state *crng) rv = random_get_entropy(); arch_init = false; } - crng->state[i] ^= rv; + primary_crng.state[i] ^= rv; } return arch_init; @@ -788,16 +788,16 @@ static void crng_initialize_secondary(struct crng_state *crng) crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } -static void __init crng_initialize_primary(struct crng_state *crng) +static void __init crng_initialize_primary(void) { - _extract_entropy(&crng->state[4], sizeof(u32) * 12); - if (crng_init_try_arch_early(crng) && trust_cpu && crng_init < 2) { + _extract_entropy(&primary_crng.state[4], sizeof(u32) * 12); + if (crng_init_try_arch_early() && trust_cpu && crng_init < 2) { invalidate_batched_entropy(); numa_crng_init(); crng_init = 2; pr_notice("crng init done (trusting CPU's manufacturer)\n"); } - crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1; + primary_crng.init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } static void crng_finalize_init(struct crng_state *crng) @@ -1698,7 +1698,7 @@ int __init rand_initialize(void) init_std_data(); if (crng_need_final_init) crng_finalize_init(&primary_crng); - crng_initialize_primary(&primary_crng); + crng_initialize_primary(); crng_global_init_time = jiffies; if (ratelimit_disable) { urandom_warning.interval = 0; From 9d5505f1eebeca778074a0260ed077fd85f8792c Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 30 Jan 2022 22:03:20 +0100 Subject: [PATCH 1093/1295] random: only call crng_finalize_init() for primary_crng crng_finalize_init() returns instantly if it is called for another pool than primary_crng. The test whether crng_finalize_init() is still required can be moved to the relevant caller in crng_reseed(), and crng_need_final_init can be reset to false if crng_finalize_init() is called with workqueues ready. Then, no previous callsite will call crng_finalize_init() unless it is needed, and we can get rid of the superfluous function parameter. Signed-off-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c49f9c5b8110..3404a91edf29 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -800,10 +800,8 @@ static void __init crng_initialize_primary(void) primary_crng.init_time = jiffies - CRNG_RESEED_INTERVAL - 1; } -static void crng_finalize_init(struct crng_state *crng) +static void crng_finalize_init(void) { - if (crng != &primary_crng || crng_init >= 2) - return; if (!system_wq) { /* We can't call numa_crng_init until we have workqueues, * so mark this for processing later. */ @@ -814,6 +812,7 @@ static void crng_finalize_init(struct crng_state *crng) invalidate_batched_entropy(); numa_crng_init(); crng_init = 2; + crng_need_final_init = false; process_random_ready_list(); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); @@ -980,7 +979,8 @@ static void crng_reseed(struct crng_state *crng, bool use_input_pool) memzero_explicit(&buf, sizeof(buf)); WRITE_ONCE(crng->init_time, jiffies); spin_unlock_irqrestore(&crng->lock, flags); - crng_finalize_init(crng); + if (crng == &primary_crng && crng_init < 2) + crng_finalize_init(); } static void _extract_crng(struct crng_state *crng, u8 out[CHACHA_BLOCK_SIZE]) @@ -1697,7 +1697,7 @@ int __init rand_initialize(void) { init_std_data(); if (crng_need_final_init) - crng_finalize_init(&primary_crng); + crng_finalize_init(); crng_initialize_primary(); crng_global_init_time = jiffies; if (ratelimit_disable) { From 3c04d84508b54fcf524093b0d4a718680ed67f0f Mon Sep 17 00:00:00 2001 From: Myrtle Shah Date: Thu, 20 Jan 2022 15:33:37 +0000 Subject: [PATCH 1094/1295] riscv: Fix XIP_FIXUP_FLASH_OFFSET There were several problems with the calculation. Not only was an 'and' being computed into t1 but thrown away; but the 'and' itself would cause problems if the granularity of the XIP physical address was less than XIP_OFFSET - in my case I had the kernel image at 2MB in SPI flash. Fixes: f9ace4ede49b ("riscv: remove .text section size limitation for XIP") Cc: stable@vger.kernel.org Signed-off-by: Myrtle Shah Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 2363b43312fc..ec07f991866a 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -22,14 +22,13 @@ add \reg, \reg, t0 .endm .macro XIP_FIXUP_FLASH_OFFSET reg - la t1, __data_loc - li t0, XIP_OFFSET_MASK - and t1, t1, t0 - li t1, XIP_OFFSET - sub t0, t0, t1 - sub \reg, \reg, t0 + la t0, __data_loc + REG_L t1, _xip_phys_offset + sub \reg, \reg, t1 + add \reg, \reg, t0 .endm _xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET +_xip_phys_offset: .dword CONFIG_XIP_PHYS_ADDR + XIP_OFFSET #else .macro XIP_FIXUP_OFFSET reg .endm From 4cbd93c3c110447adc66cb67c08af21f939ae2d7 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 27 Jan 2022 13:29:51 -0800 Subject: [PATCH 1095/1295] pidfd: fix test failure due to stack overflow on some arches When running the pidfd_fdinfo_test on arm64, it fails for me. After some digging, the reason is that the child exits due to SIGBUS, because it overflows the 1024 byte stack we've reserved for it. To fix the issue, increase the stack size to 8192 bytes (this number is somewhat arbitrary, and was arrived at through experimentation -- I kept doubling until the failure no longer occurred). Also, let's make the issue easier to debug. wait_for_pid() returns an ambiguous value: it may return -1 in all of these cases: 1. waitpid() itself returned -1 2. waitpid() returned success, but we found !WIFEXITED(status). 3. The child process exited, but it did so with a -1 exit code. There's no way for the caller to tell the difference. So, at least log which occurred, so the test runner can debug things. While debugging this, I found that we had !WIFEXITED(), because the child exited due to a signal. This seems like a reasonably common case, so also print out whether or not we have WIFSIGNALED(), and the associated WTERMSIG() (if any). This lets us see the SIGBUS I'm fixing clearly when it occurs. Finally, I'm suspicious of allocating the child's stack on our stack. man clone(2) suggests that the correct way to do this is with mmap(), and in particular by setting MAP_STACK. So, switch to doing it that way instead. Signed-off-by: Axel Rasmussen Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd.h | 13 ++++++++--- .../selftests/pidfd/pidfd_fdinfo_test.c | 22 +++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 01f8d3c0cf2c..6922d6417e1c 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -68,7 +68,7 @@ #define PIDFD_SKIP 3 #define PIDFD_XFAIL 4 -int wait_for_pid(pid_t pid) +static inline int wait_for_pid(pid_t pid) { int status, ret; @@ -78,13 +78,20 @@ again: if (errno == EINTR) goto again; + ksft_print_msg("waitpid returned -1, errno=%d\n", errno); return -1; } - if (!WIFEXITED(status)) + if (!WIFEXITED(status)) { + ksft_print_msg( + "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", + WIFSIGNALED(status), WTERMSIG(status)); return -1; + } - return WEXITSTATUS(status); + ret = WEXITSTATUS(status); + ksft_print_msg("waitpid WEXITSTATUS=%d\n", ret); + return ret; } static inline int sys_pidfd_open(pid_t pid, unsigned int flags) diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index 22558524f71c..3fd8e903118f 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "pidfd.h" #include "../kselftest.h" @@ -80,7 +81,10 @@ static inline int error_check(struct error *err, const char *test_name) return err->code; } +#define CHILD_STACK_SIZE 8192 + struct child { + char *stack; pid_t pid; int fd; }; @@ -89,17 +93,22 @@ static struct child clone_newns(int (*fn)(void *), void *args, struct error *err) { static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD; - size_t stack_size = 1024; - char *stack[1024] = { 0 }; struct child ret; if (!(flags & CLONE_NEWUSER) && geteuid() != 0) flags |= CLONE_NEWUSER; + ret.stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (ret.stack == MAP_FAILED) { + error_set(err, -1, "mmap of stack failed (errno %d)", errno); + return ret; + } + #ifdef __ia64__ - ret.pid = __clone2(fn, stack, stack_size, flags, args, &ret.fd); + ret.pid = __clone2(fn, ret.stack, CHILD_STACK_SIZE, flags, args, &ret.fd); #else - ret.pid = clone(fn, stack + stack_size, flags, args, &ret.fd); + ret.pid = clone(fn, ret.stack + CHILD_STACK_SIZE, flags, args, &ret.fd); #endif if (ret.pid < 0) { @@ -129,6 +138,11 @@ static inline int child_join(struct child *child, struct error *err) else if (r > 0) error_set(err, r, "child %d reported: %d", child->pid, r); + if (munmap(child->stack, CHILD_STACK_SIZE)) { + error_set(err, -1, "munmap of child stack failed (errno %d)", errno); + r = -1; + } + return r; } From e2aa5e650b07693477dff554053605976789fd68 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 27 Jan 2022 14:11:15 -0800 Subject: [PATCH 1096/1295] selftests: fixup build warnings in pidfd / clone3 tests These are some trivial fixups, which were needed to build the tests with clang and -Werror. The following issues are fixed: - Remove various unused variables. - In child_poll_leader_exit_test, clang isn't smart enough to realize syscall(SYS_exit, 0) won't return, so it complains we never return from a non-void function. Add an extra exit(0) to appease it. - In test_pidfd_poll_leader_exit, ret may be branched on despite being uninitialized, if we have !use_waitpid. Initialize it to zero to get the right behavior in that case. Signed-off-by: Axel Rasmussen Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3.c | 2 -- tools/testing/selftests/pidfd/pidfd_test.c | 6 +++--- tools/testing/selftests/pidfd/pidfd_wait.c | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 076cf4325f78..cd4582129c7d 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -126,8 +126,6 @@ static void test_clone3(uint64_t flags, size_t size, int expected, int main(int argc, char *argv[]) { - pid_t pid; - uid_t uid = getuid(); ksft_print_header(); diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index 529eb700ac26..9a2d64901d59 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -441,7 +441,6 @@ static void test_pidfd_poll_exec(int use_waitpid) { int pid, pidfd = 0; int status, ret; - pthread_t t1; time_t prog_start = time(NULL); const char *test_name = "pidfd_poll check for premature notification on child thread exec"; @@ -500,13 +499,14 @@ static int child_poll_leader_exit_test(void *args) */ *child_exit_secs = time(NULL); syscall(SYS_exit, 0); + /* Never reached, but appeases compiler thinking we should return. */ + exit(0); } static void test_pidfd_poll_leader_exit(int use_waitpid) { int pid, pidfd = 0; - int status, ret; - time_t prog_start = time(NULL); + int status, ret = 0; const char *test_name = "pidfd_poll check for premature notification on non-empty" "group leader exit"; diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c index be2943f072f6..17999e082aa7 100644 --- a/tools/testing/selftests/pidfd/pidfd_wait.c +++ b/tools/testing/selftests/pidfd/pidfd_wait.c @@ -39,7 +39,7 @@ static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options, TEST(wait_simple) { - int pidfd = -1, status = 0; + int pidfd = -1; pid_t parent_tid = -1; struct clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), @@ -47,7 +47,6 @@ TEST(wait_simple) .flags = CLONE_PIDFD | CLONE_PARENT_SETTID, .exit_signal = SIGCHLD, }; - int ret; pid_t pid; siginfo_t info = { .si_signo = 0, @@ -88,7 +87,7 @@ TEST(wait_simple) TEST(wait_states) { - int pidfd = -1, status = 0; + int pidfd = -1; pid_t parent_tid = -1; struct clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), From 183f80fd72db42c9cc483aa7a5e8e881355d0b03 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 1 Feb 2022 18:38:36 +0000 Subject: [PATCH 1097/1295] selftests/ir: fix build with ancient kernel headers Since commit e2bcbd7769ee ("tools headers UAPI: remove stale lirc.h"), the build of the selftests fails on rhel 8 since its version of /usr/include/linux/lirc.h has no definition of RC_PROTO_RCMM32, etc [1]. [1] https://lkml.org/lkml/2022/1/28/275 Fixes: e2bcbd7769ee ("tools headers UAPI: remove stale lirc.h") Reviewed-by: Shuah Khan Reported-by: kernel test robot Signed-off-by: Sean Young Signed-off-by: Shuah Khan --- tools/testing/selftests/ir/ir_loopback.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index 06256c96df12..f4a15cbdd5ea 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -29,6 +29,16 @@ #define SYSFS_PATH_MAX 256 #define DNAME_PATH_MAX 256 +/* + * Support ancient lirc.h which does not have these values. Can be removed + * once RHEL 8 is no longer a relevant testing platform. + */ +#if RC_PROTO_MAX < 26 +#define RC_PROTO_RCMM12 24 +#define RC_PROTO_RCMM24 25 +#define RC_PROTO_RCMM32 26 +#endif + static const struct { enum rc_proto proto; const char *name; From 7dd3876205df92e07d824fe2264b38e0b8a9eec1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 2 Feb 2022 09:52:41 -0600 Subject: [PATCH 1098/1295] PCI: kirin: Add dev struct for of_device_get_match_data() Bean reported that a622435fbe1a ("PCI: kirin: Prefer of_device_get_match_data()") broke kirin_pcie_probe() because it assumed match data of 0 was a failure when in fact, it meant the match data was "(void *)PCIE_KIRIN_INTERNAL_PHY". Therefore, probing of "hisilicon,kirin960-pcie" devices failed with -EINVAL and an "OF data missing" message. Add a struct kirin_pcie_data to encode the PHY type. Then the result of of_device_get_match_data() should always be a non-NULL pointer to a struct kirin_pcie_data that contains the PHY type. Fixes: a622435fbe1a ("PCI: kirin: Prefer of_device_get_match_data()") Link: https://lore.kernel.org/r/20220202162659.GA12603@bhelgaas Link: https://lore.kernel.org/r/20220201215941.1203155-1-huobean@gmail.com Reported-by: Bean Huo Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-kirin.c | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index fa6886d66488..c625fc6bb287 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -756,22 +756,28 @@ static int __exit kirin_pcie_remove(struct platform_device *pdev) return 0; } +struct kirin_pcie_data { + enum pcie_kirin_phy_type phy_type; +}; + +static const struct kirin_pcie_data kirin_960_data = { + .phy_type = PCIE_KIRIN_INTERNAL_PHY, +}; + +static const struct kirin_pcie_data kirin_970_data = { + .phy_type = PCIE_KIRIN_EXTERNAL_PHY, +}; + static const struct of_device_id kirin_pcie_match[] = { - { - .compatible = "hisilicon,kirin960-pcie", - .data = (void *)PCIE_KIRIN_INTERNAL_PHY - }, - { - .compatible = "hisilicon,kirin970-pcie", - .data = (void *)PCIE_KIRIN_EXTERNAL_PHY - }, + { .compatible = "hisilicon,kirin960-pcie", .data = &kirin_960_data }, + { .compatible = "hisilicon,kirin970-pcie", .data = &kirin_970_data }, {}, }; static int kirin_pcie_probe(struct platform_device *pdev) { - enum pcie_kirin_phy_type phy_type; struct device *dev = &pdev->dev; + const struct kirin_pcie_data *data; struct kirin_pcie *kirin_pcie; struct dw_pcie *pci; int ret; @@ -781,13 +787,12 @@ static int kirin_pcie_probe(struct platform_device *pdev) return -EINVAL; } - phy_type = (long)of_device_get_match_data(dev); - if (!phy_type) { + data = of_device_get_match_data(dev); + if (!data) { dev_err(dev, "OF data missing\n"); return -EINVAL; } - kirin_pcie = devm_kzalloc(dev, sizeof(struct kirin_pcie), GFP_KERNEL); if (!kirin_pcie) return -ENOMEM; @@ -800,7 +805,7 @@ static int kirin_pcie_probe(struct platform_device *pdev) pci->ops = &kirin_dw_pcie_ops; pci->pp.ops = &kirin_pcie_host_ops; kirin_pcie->pci = pci; - kirin_pcie->type = phy_type; + kirin_pcie->type = data->phy_type; ret = kirin_pcie_get_resource(kirin_pcie, pdev); if (ret) From 4b1c70aa8ed8249608bb991380cb8ff423edf49e Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 4 Feb 2022 13:13:37 -0800 Subject: [PATCH 1099/1295] riscv/mm: Add XIP_FIXUP for phys_ram_base This manifests as a crash early in boot on VexRiscv. Signed-off-by: Myrtle Shah [Palmer: split commit] Fixes: 6d7f91d914bc ("riscv: Get rid of CONFIG_PHYS_RAM_BASE in kernel physical address conversion") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index cf4d018b7d66..eecfacac2cc5 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -522,6 +522,7 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) } #ifdef CONFIG_XIP_KERNEL +#define phys_ram_base (*(phys_addr_t *)XIP_FIXUP(&phys_ram_base)) extern char _xiprom[], _exiprom[], __data_loc; /* called from head.S with MMU off */ From ca0cb9a60f6d86d4b2139c6f393a78f39edcd7cb Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 4 Feb 2022 13:14:08 -0800 Subject: [PATCH 1100/1295] riscv/mm: Add XIP_FIXUP for riscv_pfn_base This manifests as a crash early in boot on VexRiscv. Signed-off-by: Myrtle Shah [Palmer: split commit] Fixes: 44c922572952 ("RISC-V: enable XIP") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index eecfacac2cc5..c27294128e18 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -232,6 +232,7 @@ static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAG #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) +#define riscv_pfn_base (*(unsigned long *)XIP_FIXUP(&riscv_pfn_base)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) From f8d9d938514f46c4892aff6bfe32f425e84d81cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Feb 2022 14:55:47 -0800 Subject: [PATCH 1101/1295] tcp: take care of mixed splice()/sendmsg(MSG_ZEROCOPY) case syzbot found that mixing sendpage() and sendmsg(MSG_ZEROCOPY) calls over the same TCP socket would again trigger the infamous warning in inet_sock_destruct() WARN_ON(sk_forward_alloc_get(sk)); While Talal took into account a mix of regular copied data and MSG_ZEROCOPY one in the same skb, the sendpage() path has been forgotten. We want the charging to happen for sendpage(), because pages could be coming from a pipe. What is missing is the downgrading of pure zerocopy status to make sure sk_forward_alloc will stay synced. Add tcp_downgrade_zcopy_pure() helper so that we can use it from the two callers. Fixes: 9b65b17db723 ("net: avoid double accounting for pure zerocopy skbs") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Talal Ahmad Cc: Arjun Roy Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20220203225547.665114-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bdf108f544a4..02cb275e5487 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,6 +937,22 @@ void tcp_remove_empty_skb(struct sock *sk) } } +/* skb changing from pure zc to mixed, must charge zc */ +static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) +{ + if (unlikely(skb_zcopy_pure(skb))) { + u32 extra = skb->truesize - + SKB_TRUESIZE(skb_end_offset(skb)); + + if (!sk_wmem_schedule(sk, extra)) + return -ENOMEM; + + sk_mem_charge(sk, extra); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + return 0; +} + static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, struct page *page, int offset, size_t *size) { @@ -972,7 +988,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_wmem_schedule(sk, copy)) + if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy)) return NULL; if (can_coalesce) { @@ -1320,19 +1336,8 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); - /* skb changing from pure zc to mixed, must charge zc */ - if (unlikely(skb_zcopy_pure(skb))) { - u32 extra = skb->truesize - - SKB_TRUESIZE(skb_end_offset(skb)); - - if (!sk_wmem_schedule(sk, extra)) - goto wait_for_space; - - sk_mem_charge(sk, extra); - skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; - } - - if (!sk_wmem_schedule(sk, copy)) + if (tcp_downgrade_zcopy_pure(sk, skb) || + !sk_wmem_schedule(sk, copy)) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, From c6ce9c5831cae515d375a01b97ae1778689acf19 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 2 Feb 2022 17:46:48 +1100 Subject: [PATCH 1102/1295] crypto: api - Move cryptomgr soft dependency into algapi The soft dependency on cryptomgr is only needed in algapi because if algapi isn't present then no algorithms can be loaded. This also fixes the case where api is built-in but algapi is built as a module as the soft dependency would otherwise get lost. Fixes: 8ab23d547f65 ("crypto: api - Add softdep on cryptomgr") Reported-by: Jan Beulich Signed-off-by: Herbert Xu Tested-by: Jan Beulich Signed-off-by: Herbert Xu --- crypto/algapi.c | 1 + crypto/api.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index a366cb3e8aa1..76fdaa16bd4a 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -1324,3 +1324,4 @@ module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cryptographic algorithms API"); +MODULE_SOFTDEP("pre: cryptomgr"); diff --git a/crypto/api.c b/crypto/api.c index cf0869dd130b..7ddfe946dd56 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -643,4 +643,3 @@ EXPORT_SYMBOL_GPL(crypto_req_done); MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: cryptomgr"); From 59085208e4a2183998964844f8684fea0378128d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 5 Feb 2022 01:03:21 +0200 Subject: [PATCH 1103/1295] net: mscc: ocelot: fix all IP traffic getting trapped to CPU with PTP over IP The filters for the PTP trap keys are incorrectly configured, in the sense that is2_entry_set() only looks at trap->key.ipv4.dport or trap->key.ipv6.dport if trap->key.ipv4.proto or trap->key.ipv6.proto is set to IPPROTO_TCP or IPPROTO_UDP. But we don't do that, so is2_entry_set() goes through the "else" branch of the IP protocol check, and ends up installing a rule for "Any IP protocol match" (because msk is also 0). The UDP port is ignored. This means that when we run "ptp4l -i swp0 -4", all IP traffic is trapped to the CPU, which hinders bridging. Fix this by specifying the IP protocol in the VCAP IS2 filters for PTP over UDP. Fixes: 96ca08c05838 ("net: mscc: ocelot: set up traps for PTP packets") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 455293aa6343..354e4474bcc3 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1432,6 +1432,8 @@ static void ocelot_populate_ipv4_ptp_event_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV4; + trap->key.ipv4.proto.value[0] = IPPROTO_UDP; + trap->key.ipv4.proto.mask[0] = 0xff; trap->key.ipv4.dport.value = PTP_EV_PORT; trap->key.ipv4.dport.mask = 0xffff; } @@ -1440,6 +1442,8 @@ static void ocelot_populate_ipv6_ptp_event_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; + trap->key.ipv4.proto.value[0] = IPPROTO_UDP; + trap->key.ipv4.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_EV_PORT; trap->key.ipv6.dport.mask = 0xffff; } @@ -1448,6 +1452,8 @@ static void ocelot_populate_ipv4_ptp_general_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV4; + trap->key.ipv4.proto.value[0] = IPPROTO_UDP; + trap->key.ipv4.proto.mask[0] = 0xff; trap->key.ipv4.dport.value = PTP_GEN_PORT; trap->key.ipv4.dport.mask = 0xffff; } @@ -1456,6 +1462,8 @@ static void ocelot_populate_ipv6_ptp_general_trap_key(struct ocelot_vcap_filter *trap) { trap->key_type = OCELOT_VCAP_KEY_IPV6; + trap->key.ipv4.proto.value[0] = IPPROTO_UDP; + trap->key.ipv4.proto.mask[0] = 0xff; trap->key.ipv6.dport.value = PTP_GEN_PORT; trap->key.ipv6.dport.mask = 0xffff; } From 6df4432a5eca101b5fd80fbee41d309f3d67928d Mon Sep 17 00:00:00 2001 From: Christoph Niedermaier Date: Tue, 1 Feb 2022 12:01:53 +0100 Subject: [PATCH 1104/1295] drm/panel: simple: Assign data from panel_dpi_probe() correctly In the function panel_simple_probe() the pointer panel->desc is assigned to the passed pointer desc. If function panel_dpi_probe() is called panel->desc will be updated, but further on only desc will be evaluated. So update the desc pointer to be able to use the data from the function panel_dpi_probe(). Fixes: 4a1d0dbc8332 ("drm/panel: simple: add panel-dpi support") Signed-off-by: Christoph Niedermaier Cc: Marek Vasut Cc: Thierry Reding Cc: Sam Ravnborg Cc: David Airlie Cc: Daniel Vetter To: dri-devel@lists.freedesktop.org Reviewed-by: Sam Ravnborg Signed-off-by: Marek Vasut Link: https://patchwork.freedesktop.org/patch/msgid/20220201110153.3479-1-cniedermaier@dh-electronics.com --- drivers/gpu/drm/panel/panel-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 9e46db5e359c..3c08f9827acf 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -588,6 +588,7 @@ static int panel_simple_probe(struct device *dev, const struct panel_desc *desc) err = panel_dpi_probe(dev, panel); if (err) goto free_ddc; + desc = panel->desc; } else { if (!of_get_display_timing(dev->of_node, "panel-timing", &dt)) panel_simple_parse_panel_timing_node(dev, panel, &dt); From aec12836e7196e4d360b2cbf20cf7aa5139ad2ec Mon Sep 17 00:00:00 2001 From: Pavel Parkhomenko Date: Sun, 6 Feb 2022 00:49:51 +0300 Subject: [PATCH 1105/1295] net: phy: marvell: Fix MDI-x polarity setting in 88e1118-compatible PHYs When setting up autonegotiation for 88E1118R and compatible PHYs, a software reset of PHY is issued before setting up polarity. This is incorrect as changes of MDI Crossover Mode bits are disruptive to the normal operation and must be followed by a software reset to take effect. Let's patch m88e1118_config_aneg() to fix the issue mentioned before by invoking software reset of the PHY just after setting up MDI-x polarity. Fixes: 605f196efbf8 ("phy: Add support for Marvell 88E1118 PHY") Signed-off-by: Pavel Parkhomenko Reviewed-by: Serge Semin Suggested-by: Andrew Lunn Cc: stable@vger.kernel.org Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index fa71fb7a66b5..ab063961ac00 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1213,16 +1213,15 @@ static int m88e1118_config_aneg(struct phy_device *phydev) { int err; - err = genphy_soft_reset(phydev); - if (err < 0) - return err; - err = marvell_set_polarity(phydev, phydev->mdix_ctrl); if (err < 0) return err; err = genphy_config_aneg(phydev); - return 0; + if (err < 0) + return err; + + return genphy_soft_reset(phydev); } static int m88e1118_config_init(struct phy_device *phydev) From 28f9222138868899c53e00bc1f910faa55f88546 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 Feb 2022 21:05:16 -0800 Subject: [PATCH 1106/1295] net/smc: fix ref_tracker issue in smc_pnet_add() I added the netdev_tracker_alloc() right after ndev was stored into the newly allocated object: new_pe->ndev = ndev; if (ndev) netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_KERNEL); But I missed that later, we could end up freeing new_pe, then calling dev_put(ndev) to release the reference on ndev. The new_pe->dev_tracker would not be freed. To solve this issue, move the netdev_tracker_alloc() call to the point we know for sure new_pe will be kept. syzbot report (on net-next tree, but the bug is present in net tree) WARNING: CPU: 0 PID: 6019 at lib/refcount.c:31 refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Modules linked in: CPU: 0 PID: 6019 Comm: syz-executor.3 Not tainted 5.17.0-rc2-syzkaller-00650-g5a8fb33e5305 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Code: 1d f4 70 a0 09 31 ff 89 de e8 4d bc 99 fd 84 db 75 e0 e8 64 b8 99 fd 48 c7 c7 20 0c 06 8a c6 05 d4 70 a0 09 01 e8 9e 4e 28 05 <0f> 0b eb c4 e8 48 b8 99 fd 0f b6 1d c3 70 a0 09 31 ff 89 de e8 18 RSP: 0018:ffffc900043b7400 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000040000 RSI: ffffffff815fb318 RDI: fffff52000876e72 RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815f507e R11: 0000000000000000 R12: 1ffff92000876e85 R13: 0000000000000000 R14: ffff88805c1c6600 R15: 0000000000000000 FS: 00007f1ef6feb700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2d02b000 CR3: 00000000223f4000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] ref_tracker_free+0x53f/0x6c0 lib/ref_tracker.c:119 netdev_tracker_free include/linux/netdevice.h:3867 [inline] dev_put_track include/linux/netdevice.h:3884 [inline] dev_put_track include/linux/netdevice.h:3880 [inline] dev_put include/linux/netdevice.h:3910 [inline] smc_pnet_add_eth net/smc/smc_pnet.c:399 [inline] smc_pnet_enter net/smc/smc_pnet.c:493 [inline] smc_pnet_add+0x5fc/0x15f0 net/smc/smc_pnet.c:556 genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:731 genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:792 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: b60645248af3 ("net/smc: add net device tracker to struct smc_pnetentry") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 291f1484a1b7..fb6331d97185 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -368,9 +368,6 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); - new_pe->ndev = ndev; - if (ndev) - netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_KERNEL); rc = -EEXIST; new_netdev = true; write_lock(&pnettable->lock); @@ -382,6 +379,11 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, } } if (new_netdev) { + if (ndev) { + new_pe->ndev = ndev; + netdev_tracker_alloc(ndev, &new_pe->dev_tracker, + GFP_KERNEL); + } list_add_tail(&new_pe->list, &pnettable->pnetlist); write_unlock(&pnettable->lock); } else { From b7b9825fbee708e17ab7ea8b583561587c8ff7df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 1107/1295] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: f6c6804c43fa18d3 ("kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Janosch Frank Cc: Paolo Bonzini Link: http://lore.kernel.org/lkml/Yf+4k5Fs5Q3HdSG9@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b46bcdb0cab1..5191b57e1562 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ From ae65b443f03fca3620cb37c5e019ddca3f89a1ce Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 2 Feb 2022 16:27:23 +0530 Subject: [PATCH 1108/1295] perf tools: Add missing branch_sample_type to perf_event_attr__fprintf() This updates branch sample type with missing PERF_SAMPLE_BRANCH_TYPE_SAVE. Suggested-by: James Clark Signed-off-by: Anshuman Khandual Acked-by: Jiri Olsa Cc: James Clark Cc: Mark Rutland Cc: Peter Zijlstra Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/1643799443-15109-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/perf_event_attr_fprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 47b7531f51da..98af3fa4ea35 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -52,7 +52,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), - bit_name(HW_INDEX), + bit_name(TYPE_SAVE), bit_name(HW_INDEX), { .name = NULL, } }; #undef bit_name From a663520fcc4bce2814032e3de6c4e2665b9555e5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Feb 2022 23:08:25 -0800 Subject: [PATCH 1109/1295] perf annotate: Set error stream of objdump process for TUI The stderr should be set to a pipe when using TUI. Otherwise it'd print to stdout and break TUI windows with an error message. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20220202070828.143303-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 01900689dc00..8190a124b99d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2036,6 +2036,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) memset(&objdump_process, 0, sizeof(objdump_process)); objdump_process.argv = objdump_argv; objdump_process.out = -1; + objdump_process.err = -1; if (start_command(&objdump_process)) { pr_err("Failure starting to run %s\n", command); err = -1; From d792a7a94c2c3ba045247266bee2e2bced7b495a Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 25 Jan 2022 17:11:41 +0500 Subject: [PATCH 1110/1295] perf session: Check for NULL pointer before dereference Move NULL pointer check before dereferencing the variable. Addresses-Coverity: 1497622 ("Derereference before null check") Reviewed-by: James Clark Signed-off-by: Ameer Hamza Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: German Gomez Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Link: https://lore.kernel.org/r/20220125121141.18347-1-amhamza.mgc@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2c0d30f08e78..498b05708db5 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1503,11 +1503,12 @@ static int machines__deliver_event(struct machines *machines, ++evlist->stats.nr_unknown_id; return 0; } - dump_sample(evsel, event, sample, perf_env__arch(machine->env)); if (machine == NULL) { ++evlist->stats.nr_unprocessable_samples; + dump_sample(evsel, event, sample, perf_env__arch(NULL)); return 0; } + dump_sample(evsel, event, sample, perf_env__arch(machine->env)); return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine); case PERF_RECORD_MMAP: return tool->mmap(tool, event, sample, machine); From bc9c806e524429a4b98de257179af5e3fc2cb57d Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 24 Dec 2021 20:40:13 +0800 Subject: [PATCH 1111/1295] perf synthetic-events: Return error if procfs isn't mounted for PID namespaces For perf recording, it retrieves process info by iterating nodes in proc fs. If we run perf in a non-root PID namespace with command: # unshare --fork --pid perf record -e cycles -a -- test_program ... in this case, unshare command creates a child PID namespace and launches perf tool in it, but the issue is the proc fs is not mounted for the non-root PID namespace, this leads to the perf tool gathering process info from its parent PID namespace. We can use below command to observe the process nodes under proc fs: # unshare --pid --fork ls /proc 1 137 1968 2128 3 342 48 62 78 crypto kcore net uptime 10 138 2 2142 30 35 49 63 8 devices keys pagetypeinfo version 11 139 20 2143 304 36 50 64 82 device-tree key-users partitions vmallocinfo 12 14 2011 22 305 37 51 65 83 diskstats kmsg self vmstat 128 140 2038 23 307 39 52 656 84 driver kpagecgroup slabinfo zoneinfo 129 15 2074 24 309 4 53 67 9 execdomains kpagecount softirqs 13 16 2094 241 31 40 54 68 asound fb kpageflags stat 130 164 2096 242 310 41 55 69 buddyinfo filesystems loadavg swaps 131 17 2098 25 317 42 56 70 bus fs locks sys 132 175 21 26 32 43 57 71 cgroups interrupts meminfo sysrq-trigger 133 179 2102 263 329 44 58 75 cmdline iomem misc sysvipc 134 1875 2103 27 330 45 59 76 config.gz ioports modules thread-self 135 19 2117 29 333 46 6 77 consoles irq mounts timer_list 136 1941 2121 298 34 47 60 773 cpuinfo kallsyms mtd tty So it shows many existed tasks, since unshared command has not mounted the proc fs for the new created PID namespace, it still accesses the proc fs of the root PID namespace. This leads to two prominent issues: - Firstly, PID values are mismatched between thread info and samples. The gathered thread info are coming from the proc fs of the root PID namespace, but samples record its PID from the child PID namespace. - The second issue is profiled program 'test_program' returns its forked PID number from the child PID namespace, perf tool wrongly uses this PID number to retrieve the process info via the proc fs of the root PID namespace. To avoid issues, we need to mount proc fs for the child PID namespace with the option '--mount-proc' when use unshare command: # unshare --fork --pid --mount-proc perf record -e cycles -a -- test_program Conversely, when the proc fs of the root PID namespace is used by child namespace, perf tool can detect the multiple PID levels and nsinfo__is_in_root_namespace() returns false, this patch reports error for this case: # unshare --fork --pid perf record -e cycles -a -- test_program Couldn't synthesize bpf events. Perf runs in non-root PID namespace but it tries to gather process info from its parent PID namespace. Please mount the proc file system properly, e.g. add the option '--mount-proc' for unshare command. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Yonghong Song Link: https://lore.kernel.org/r/20211224124014.2492751-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 70f095624a0b..b654de0841f8 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1784,6 +1784,25 @@ int __machine__synthesize_threads(struct machine *machine, struct perf_tool *too perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize) { + /* + * When perf runs in non-root PID namespace, and the namespace's proc FS + * is not mounted, nsinfo__is_in_root_namespace() returns false. + * In this case, the proc FS is coming for the parent namespace, thus + * perf tool will wrongly gather process info from its parent PID + * namespace. + * + * To avoid the confusion that the perf tool runs in a child PID + * namespace but it synthesizes thread info from its parent PID + * namespace, returns failure with warning. + */ + if (!nsinfo__is_in_root_namespace()) { + pr_err("Perf runs in non-root PID namespace but it tries to "); + pr_err("gather process info from its parent PID namespace.\n"); + pr_err("Please mount the proc file system properly, e.g. "); + pr_err("add the option '--mount-proc' for unshare command.\n"); + return -EPERM; + } + if (target__has_task(target)) return perf_event__synthesize_thread_map(tool, threads, process, machine, needs_mmap, data_mmap); From a2887b9b8d1db7be971e5951e08ffe8563ea412f Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sat, 25 Dec 2021 09:55:58 +0900 Subject: [PATCH 1112/1295] perf bpf: Fix a typo in bpf_counter_cgroup.c This patch fixes a spelling typo in error message. Signed-off-by: Masanari Iida Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20211225005558.503935-1-standby24x7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 631e34a0b66f..ac60c08e8e2a 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -266,7 +266,7 @@ static int bperf_cgrp__read(struct evsel *evsel) idx = evsel->core.idx; err = bpf_map_lookup_elem(reading_map_fd, &idx, values); if (err) { - pr_err("bpf map lookup falied: idx=%u, event=%s, cgrp=%s\n", + pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n", idx, evsel__name(evsel), evsel->cgrp->name); goto out; } From 05b5a9d6285412d97fc61b8ec113d1d4f6b950c2 Mon Sep 17 00:00:00 2001 From: German Gomez Date: Wed, 26 Jan 2022 10:59:26 +0000 Subject: [PATCH 1113/1295] perf tools: Apply correct label to user/kernel symbols in branch mode In branch mode, the branch symbols were being displayed with incorrect cpumode labels. So fix this. For example, before: # perf record -b -a -- sleep 1 # perf report -b Overhead Command Source Shared Object Source Symbol Target Symbol 0.08% swapper [kernel.kallsyms] [k] rcu_idle_enter [k] cpuidle_enter_state ==> 0.08% cmd0 [kernel.kallsyms] [.] psi_group_change [.] psi_group_change 0.08% cmd1 [kernel.kallsyms] [k] psi_group_change [k] psi_group_change After: # perf report -b Overhead Command Source Shared Object Source Symbol Target Symbol 0.08% swapper [kernel.kallsyms] [k] rcu_idle_enter [k] cpuidle_enter_state 0.08% cmd0 [kernel.kallsyms] [k] psi_group_change [k] pei_group_change 0.08% cmd1 [kernel.kallsyms] [k] psi_group_change [k] psi_group_change Reviewed-by: James Clark Signed-off-by: German Gomez Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220126105927.3411216-1-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 ++ tools/perf/util/map_symbol.h | 1 + tools/perf/util/sort.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f70ba56912d4..394550003693 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2073,6 +2073,7 @@ static void ip__resolve_ams(struct thread *thread, ams->addr = ip; ams->al_addr = al.addr; + ams->al_level = al.level; ams->ms.maps = al.maps; ams->ms.sym = al.sym; ams->ms.map = al.map; @@ -2092,6 +2093,7 @@ static void ip__resolve_data(struct thread *thread, ams->addr = addr; ams->al_addr = al.addr; + ams->al_level = al.level; ams->ms.maps = al.maps; ams->ms.sym = al.sym; ams->ms.map = al.map; diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h index 7d22ade082c8..e08817b0c30f 100644 --- a/tools/perf/util/map_symbol.h +++ b/tools/perf/util/map_symbol.h @@ -18,6 +18,7 @@ struct addr_map_symbol { struct map_symbol ms; u64 addr; u64 al_addr; + char al_level; u64 phys_addr; u64 data_page_size; }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index cfba8c337783..2da081ef532b 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -915,7 +915,7 @@ static int hist_entry__sym_from_snprintf(struct hist_entry *he, char *bf, struct addr_map_symbol *from = &he->branch_info->from; return _hist_entry__sym_snprintf(&from->ms, from->al_addr, - he->level, bf, size, width); + from->al_level, bf, size, width); } return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); @@ -928,7 +928,7 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf, struct addr_map_symbol *to = &he->branch_info->to; return _hist_entry__sym_snprintf(&to->ms, to->al_addr, - he->level, bf, size, width); + to->al_level, bf, size, width); } return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); From b2b1aa73ade982c175ac926a1fd34e76ad628b94 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 4 Feb 2022 17:09:41 -0800 Subject: [PATCH 1114/1295] perf stat: Fix display of grouped aliased events An event may have a number of uncore aliases that when added to the evlist are consecutive. If there are multiple uncore events in a group then parse_events__set_leader_for_uncore_aliase will reorder the evlist so that events on the same PMU are adjacent. The collect_all_aliases function assumes that aliases are in blocks so that only the first counter is printed and all others are marked merged. The reordering for groups breaks the assumption and so all counts are printed. This change removes the assumption from collect_all_aliases that the events are in blocks and instead processes the entire evlist. Before: ``` $ perf stat -e '{UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE,UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE},duration_time' -a -A -- sleep 1 Performance counter stats for 'system wide': CPU0 256,866 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 494,413 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 967 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,738 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 285,161 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 429,920 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 955 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,443 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 310,753 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 416,657 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,231 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,573 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 416,067 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 405,966 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,481 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,447 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 312,911 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 408,154 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,086 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,380 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 333,994 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 370,349 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,287 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,335 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 188,107 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 302,423 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 701 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,070 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 307,221 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 383,642 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,036 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,158 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 318,479 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 821,545 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,028 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,550 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 227,618 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 372,272 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 903 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,456 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 376,783 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 419,827 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,406 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,453 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 286,583 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 429,956 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 999 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,436 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 313,867 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 370,159 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,114 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,291 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,083 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 409,111 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,399 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,684 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 365,828 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 376,037 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,378 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,411 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 382,456 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 621,743 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,232 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,955 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,316 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 385,067 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,176 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,268 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 373,588 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 386,163 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,394 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,464 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 381,206 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 546,891 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,266 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,712 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 221,176 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 392,069 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 831 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,456 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 355,401 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 705,595 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,235 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,216 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 371,436 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 428,103 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,306 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,442 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 384,352 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 504,200 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,468 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,860 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 228,856 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 287,976 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 832 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,060 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 215,121 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 334,162 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 681 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,026 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 296,179 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 436,083 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,084 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,525 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 262,296 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 416,573 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 986 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,533 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 285,852 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 359,842 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,073 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,326 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 303,379 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 367,222 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,008 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,156 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 273,487 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 425,449 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 932 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,367 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 297,596 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 414,793 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,140 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,601 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 342,365 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 360,422 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,291 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,342 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 327,196 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 580,858 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,122 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,014 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 296,564 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 452,817 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,087 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,694 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 375,002 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 389,393 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,478 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 1,540 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 365,213 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 594,685 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 1,401 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 2,222 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 1,000,749,060 ns duration_time 1.000749060 seconds time elapsed ``` After: ``` Performance counter stats for 'system wide': CPU0 20,547,434 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU36 45,202,862 UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE CPU0 82,001 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU36 159,688 UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE CPU0 1,000,464,828 ns duration_time 1.000464828 seconds time elapsed ``` Fixes: 3cdc5c2cb924acb4 ("perf parse-events: Handle uncore event aliases in small groups properly") Reviewed-by: Andi Kleen Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Asaf Yaffe Cc: Caleb Biggers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vineet Singh Cc: Zhengjun Xing Link: https://lore.kernel.org/r/20220205010941.1065469-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 5db83e51ceef..9cbe351b141f 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -585,15 +585,16 @@ static void collect_all_aliases(struct perf_stat_config *config, struct evsel *c alias = list_prepare_entry(counter, &(evlist->core.entries), core.node); list_for_each_entry_continue (alias, &evlist->core.entries, core.node) { - if (strcmp(evsel__name(alias), evsel__name(counter)) || - alias->scale != counter->scale || - alias->cgrp != counter->cgrp || - strcmp(alias->unit, counter->unit) || - evsel__is_clock(alias) != evsel__is_clock(counter) || - !strcmp(alias->pmu_name, counter->pmu_name)) - break; - alias->merged_stat = true; - cb(config, alias, data, false); + /* Merge events with the same name, etc. but on different PMUs. */ + if (!strcmp(evsel__name(alias), evsel__name(counter)) && + alias->scale == counter->scale && + alias->cgrp == counter->cgrp && + !strcmp(alias->unit, counter->unit) && + evsel__is_clock(alias) == evsel__is_clock(counter) && + strcmp(alias->pmu_name, counter->pmu_name)) { + alias->merged_stat = true; + cb(config, alias, data, false); + } } } From 4f2492731ada9d702ffdfaa6ec1ff64820a1664c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 11:04:23 -0300 Subject: [PATCH 1115/1295] tools include UAPI: Sync sound/asound.h copy with the kernel sources Picking the changes from: 06feec6005c9d950 ("ASoC: hdmi-codec: Fix OOB memory accesses") Which entails no changes in the tooling side as it doesn't introduce new SNDRV_PCM_IOCTL_ ioctls. To silence this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Dmitry Osipenko Cc: Mark Brown Cc: Takashi Iwai Link: https://lore.kernel.org/lkml/Yf+6OT+2eMrYDEeX@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index ef0cafe295b2..2d3e5df39a59 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -56,8 +56,10 @@ * * ****************************************************************************/ +#define AES_IEC958_STATUS_SIZE 24 + struct snd_aes_iec958 { - unsigned char status[24]; /* AES/IEC958 channel status bits */ + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ unsigned char subcode[147]; /* AES/IEC958 subcode bits */ unsigned char pad; /* nothing */ unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ From 407eb43ae87c969d98746c3274ae5d0f977b102e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 1 Feb 2022 15:40:56 -0600 Subject: [PATCH 1116/1295] libperf: Add arm64 support to perf_mmap__read_self() Add the arm64 variants for read_perf_counter() and read_timestamp(). Unfortunately the counter number is encoded into the instruction, so the code is a bit verbose to enumerate all possible counters. Tested-by: Masayoshi Mizuma Signed-off-by: Rob Herring Acked-by: Jiri Olsa Tested-by: John Garry Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20220201214056.702854-1-robh@kernel.org Cc: Mark Rutland Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Will Deacon Cc: Alexander Shishkin Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org --- tools/lib/perf/mmap.c | 98 +++++++++++++++++++++++++++++++ tools/lib/perf/tests/test-evsel.c | 5 +- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index f7ee07cb5818..0d1634cedf44 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev, @@ -294,6 +295,103 @@ static u64 read_timestamp(void) return low | ((u64)high) << 32; } +#elif defined(__aarch64__) +#define read_sysreg(r) ({ \ + u64 __val; \ + asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ + __val; \ +}) + +static u64 read_pmccntr(void) +{ + return read_sysreg(pmccntr_el0); +} + +#define PMEVCNTR_READ(idx) \ + static u64 read_pmevcntr_##idx(void) { \ + return read_sysreg(pmevcntr##idx##_el0); \ + } + +PMEVCNTR_READ(0); +PMEVCNTR_READ(1); +PMEVCNTR_READ(2); +PMEVCNTR_READ(3); +PMEVCNTR_READ(4); +PMEVCNTR_READ(5); +PMEVCNTR_READ(6); +PMEVCNTR_READ(7); +PMEVCNTR_READ(8); +PMEVCNTR_READ(9); +PMEVCNTR_READ(10); +PMEVCNTR_READ(11); +PMEVCNTR_READ(12); +PMEVCNTR_READ(13); +PMEVCNTR_READ(14); +PMEVCNTR_READ(15); +PMEVCNTR_READ(16); +PMEVCNTR_READ(17); +PMEVCNTR_READ(18); +PMEVCNTR_READ(19); +PMEVCNTR_READ(20); +PMEVCNTR_READ(21); +PMEVCNTR_READ(22); +PMEVCNTR_READ(23); +PMEVCNTR_READ(24); +PMEVCNTR_READ(25); +PMEVCNTR_READ(26); +PMEVCNTR_READ(27); +PMEVCNTR_READ(28); +PMEVCNTR_READ(29); +PMEVCNTR_READ(30); + +/* + * Read a value direct from PMEVCNTR + */ +static u64 read_perf_counter(unsigned int counter) +{ + static u64 (* const read_f[])(void) = { + read_pmevcntr_0, + read_pmevcntr_1, + read_pmevcntr_2, + read_pmevcntr_3, + read_pmevcntr_4, + read_pmevcntr_5, + read_pmevcntr_6, + read_pmevcntr_7, + read_pmevcntr_8, + read_pmevcntr_9, + read_pmevcntr_10, + read_pmevcntr_11, + read_pmevcntr_13, + read_pmevcntr_12, + read_pmevcntr_14, + read_pmevcntr_15, + read_pmevcntr_16, + read_pmevcntr_17, + read_pmevcntr_18, + read_pmevcntr_19, + read_pmevcntr_20, + read_pmevcntr_21, + read_pmevcntr_22, + read_pmevcntr_23, + read_pmevcntr_24, + read_pmevcntr_25, + read_pmevcntr_26, + read_pmevcntr_27, + read_pmevcntr_28, + read_pmevcntr_29, + read_pmevcntr_30, + read_pmccntr + }; + + if (counter < ARRAY_SIZE(read_f)) + return (read_f[counter])(); + + return 0; +} + +static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); } + #else static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; } static u64 read_timestamp(void) { return 0; } diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index 33ae9334861a..89be89afb24d 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -130,6 +130,9 @@ static int test_stat_user_read(int event) struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = event, +#ifdef __aarch64__ + .config1 = 0x2, /* Request user access */ +#endif }; int err, i; @@ -150,7 +153,7 @@ static int test_stat_user_read(int event) pc = perf_evsel__mmap_base(evsel, 0, 0); __T("failed to get mmapped address", pc); -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) __T("userspace counter access not supported", pc->cap_user_rdpmc); __T("userspace counter access not enabled", pc->index); __T("userspace counter width not set", pc->pmc_width >= 32); From a0572cea8866230ac13da6358c88075f89e99b20 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Sat, 5 Feb 2022 19:27:30 +0530 Subject: [PATCH 1117/1295] bus: mhi: pci_generic: Add mru_default for Foxconn SDX55 For default mechanism, product would use default MRU 3500 if they didn't define it. But for Foxconn SDX55, there is a known issue which MRU 3500 would lead to data connection lost. So we align it with Qualcomm default MRU settings. Link: https://lore.kernel.org/r/20220119101213.5008-1-slark_xiao@163.com [mani: Added pci_generic prefix to subject and CCed stable] Fixes: aac426562f56 ("bus: mhi: pci_generic: Introduce Foxconn T99W175 support") Cc: stable@vger.kernel.org # v5.12+ Reviewed-by: Manivannan Sadhasivam Signed-off-by: Slark Xiao Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220205135731.157871-2-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/pci_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c index 3a258a677df8..74e8fc342cfd 100644 --- a/drivers/bus/mhi/pci_generic.c +++ b/drivers/bus/mhi/pci_generic.c @@ -366,6 +366,7 @@ static const struct mhi_pci_dev_info mhi_foxconn_sdx55_info = { .config = &modem_foxconn_sdx55_config, .bar_num = MHI_PCI_DEFAULT_BAR_NUM, .dma_data_width = 32, + .mru_default = 32768, .sideband_wake = false, }; From 05daa805a86c831ad9692f6f15e1b877c8f10638 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Sat, 5 Feb 2022 19:27:31 +0530 Subject: [PATCH 1118/1295] bus: mhi: pci_generic: Add mru_default for Cinterion MV31-W For default mechanism, product would use default MRU 3500 if they didn't define it. But for Cinterion MV31-W, there is a known issue which MRU 3500 would lead to data connection lost. So we align it with Qualcomm default MRU settings. Link: https://lore.kernel.org/r/20220119102519.5342-1-slark_xiao@163.com [mani: Modified the commit message to reflect Cinterion MV31-W and CCed stable] Fixes: 87693e092bd0 ("bus: mhi: pci_generic: Add Cinterion MV31-W PCIe to MHI") Cc: stable@vger.kernel.org # v5.14 + Reviewed-by: Manivannan Sadhasivam Signed-off-by: Slark Xiao Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220205135731.157871-3-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/pci_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c index 74e8fc342cfd..b79895810c52 100644 --- a/drivers/bus/mhi/pci_generic.c +++ b/drivers/bus/mhi/pci_generic.c @@ -402,6 +402,7 @@ static const struct mhi_pci_dev_info mhi_mv31_info = { .config = &modem_mv31_config, .bar_num = MHI_PCI_DEFAULT_BAR_NUM, .dma_data_width = 32, + .mru_default = 32768, }; static const struct mhi_channel_config mhi_sierra_em919x_channels[] = { From fceb62124d8fe1f6fb4b64e8f11c095dca8e7ea7 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 27 Jan 2022 21:20:10 +0800 Subject: [PATCH 1119/1295] perf ftrace: system_wide collection is not effective by default The ftrace.target.system_wide must be set before invoking evlist__create_maps(), otherwise it has no effect. Fixes: 53be50282269b46c ("perf ftrace: Add 'latency' subcommand") Signed-off-by: Changbin Du Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220127132010.4836-1-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 45 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index dec24dc0e767..a8785dec5ca6 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1115,6 +1115,7 @@ enum perf_ftrace_subcommand { int cmd_ftrace(int argc, const char **argv) { int ret; + int (*cmd_func)(struct perf_ftrace *) = NULL; struct perf_ftrace ftrace = { .tracer = DEFAULT_TRACER, .target = { .uid = UINT_MAX, }, @@ -1221,6 +1222,28 @@ int cmd_ftrace(int argc, const char **argv) goto out_delete_filters; } + switch (subcmd) { + case PERF_FTRACE_TRACE: + if (!argc && target__none(&ftrace.target)) + ftrace.target.system_wide = true; + cmd_func = __cmd_ftrace; + break; + case PERF_FTRACE_LATENCY: + if (list_empty(&ftrace.filters)) { + pr_err("Should provide a function to measure\n"); + parse_options_usage(ftrace_usage, options, "T", 1); + ret = -EINVAL; + goto out_delete_filters; + } + cmd_func = __cmd_latency; + break; + case PERF_FTRACE_NONE: + default: + pr_err("Invalid subcommand\n"); + ret = -EINVAL; + goto out_delete_filters; + } + ret = target__validate(&ftrace.target); if (ret) { char errbuf[512]; @@ -1248,27 +1271,7 @@ int cmd_ftrace(int argc, const char **argv) goto out_delete_evlist; } - switch (subcmd) { - case PERF_FTRACE_TRACE: - if (!argc && target__none(&ftrace.target)) - ftrace.target.system_wide = true; - ret = __cmd_ftrace(&ftrace); - break; - case PERF_FTRACE_LATENCY: - if (list_empty(&ftrace.filters)) { - pr_err("Should provide a function to measure\n"); - parse_options_usage(ftrace_usage, options, "T", 1); - ret = -EINVAL; - goto out_delete_evlist; - } - ret = __cmd_latency(&ftrace); - break; - case PERF_FTRACE_NONE: - default: - pr_err("Invalid subcommand\n"); - ret = -EINVAL; - break; - } + ret = cmd_func(&ftrace); out_delete_evlist: evlist__delete(ftrace.evlist); From dfd42facf1e4ada021b939b4e19c935dcdd55566 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Feb 2022 12:20:50 -0800 Subject: [PATCH 1120/1295] Linux 5.17-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1fc3491096cb..ceb987e5c87b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Gobble Gobble # *DOCUMENTATION* From 13765de8148f71fa795e0a6607de37c49ea5915a Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 3 Feb 2022 08:18:46 -0800 Subject: [PATCH 1121/1295] sched/fair: Fix fault in reweight_entity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot found a GPF in reweight_entity. This has been bisected to commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") There is a race between sched_post_fork() and setpriority(PRIO_PGRP) within a thread group that causes a null-ptr-deref in reweight_entity() in CFS. The scenario is that the main process spawns number of new threads, which then call setpriority(PRIO_PGRP, 0, -20), wait, and exit. For each of the new threads the copy_process() gets invoked, which adds the new task_struct and calls sched_post_fork() for it. In the above scenario there is a possibility that setpriority(PRIO_PGRP) and set_one_prio() will be called for a thread in the group that is just being created by copy_process(), and for which the sched_post_fork() has not been executed yet. This will trigger a null pointer dereference in reweight_entity(), as it will try to access the run queue pointer, which hasn't been set. Before the mentioned change the cfs_rq pointer for the task has been set in sched_fork(), which is called much earlier in copy_process(), before the new task is added to the thread_group. Now it is done in the sched_post_fork(), which is called after that. To fix the issue the remove the update_load param from the update_load param() function and call reweight_task() only if the task flag doesn't have the TASK_NEW flag set. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: syzbot+af7a719bc92395ee41b3@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220203161846.1160750-1-tadeusz.struk@linaro.org --- kernel/sched/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 848eaa0efe0e..fcf0c180617c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,8 +1214,9 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +static void set_load_weight(struct task_struct *p) { + bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4406,7 +4407,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -6921,7 +6922,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); + set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); @@ -7212,7 +7213,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p, true); + set_load_weight(p); } /* @@ -9445,7 +9446,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task, false); + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: From 5f4e5ce638e6a490b976ade4a40017b40abb2da0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 16:40:57 -0800 Subject: [PATCH 1122/1295] perf: Fix list corruption in perf_cgroup_switch() There's list corruption on cgrp_cpuctx_list. This happens on the following path: perf_cgroup_switch: list_for_each_entry(cgrp_cpuctx_list) cpu_ctx_sched_in ctx_sched_in ctx_pinned_sched_in merge_sched_in perf_cgroup_event_disable: remove the event from the list Use list_for_each_entry_safe() to allow removing an entry during iteration. Fixes: 058fe1c0440e ("perf/core: Make cgroup switch visit only cpuctxs with cgroup events") Signed-off-by: Song Liu Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220204004057.2961252-1-song@kernel.org --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 57c7197838db..6859229497b1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -839,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -850,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); From 4f9e67f5e03ab92ecbe51399a8cc55968106e8f9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 27 Jan 2022 15:06:31 +0100 Subject: [PATCH 1123/1295] MAINTAINERS: add Alexander Gordeev as maintainer for s390 Change Alexander Gordeev's status so he is maintainer instead of reviewer for s390. Acked-by: Alexander Gordeev Acked-by: Christian Borntraeger Acked-by: Sven Schnelle Acked-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 69a2935daf6c..68b2ba510674 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16818,7 +16818,7 @@ S390 M: Heiko Carstens M: Vasily Gorbik M: Christian Borntraeger -R: Alexander Gordeev +M: Alexander Gordeev R: Sven Schnelle L: linux-s390@vger.kernel.org S: Supported From 72fc40931d7a488e1646d04f988f4d5a97b02100 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 27 Jan 2022 15:24:49 +0100 Subject: [PATCH 1124/1295] MAINTAINERS: downgrade myself to Reviewer for s390 Now that Alexander Gordeev has volunteered to be a co-maintainer for s390, I can act as a reviewer instead of being a maintainer for s390. With Alexander, Heiko, and Vasily we are in really good shape. I will continue to act as the maintainer for KVM on s390 together with Janosch. Signed-off-by: Christian Borntraeger Acked-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 68b2ba510674..5c56a172ce11 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16817,8 +16817,8 @@ F: drivers/video/fbdev/savage/ S390 M: Heiko Carstens M: Vasily Gorbik -M: Christian Borntraeger M: Alexander Gordeev +R: Christian Borntraeger R: Sven Schnelle L: linux-s390@vger.kernel.org S: Supported From e286f231eab410793f3e91c924e6dbd23edee05a Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 31 Jan 2022 14:17:11 +0100 Subject: [PATCH 1125/1295] s390/module: fix building test_modules_helpers.o with clang Move test_modules_return_* prototypes into a header file in order to placate -Wmissing-prototypes. Fixes: 90c5318795ee ("s390/module: test loading modules with a lot of relocations") Reported-by: kernel test robot Reviewed-by: Heiko Carstens Signed-off-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik --- arch/s390/lib/test_modules.c | 3 --- arch/s390/lib/test_modules.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c index d056baa8fbb0..9894009fc1f2 100644 --- a/arch/s390/lib/test_modules.c +++ b/arch/s390/lib/test_modules.c @@ -5,9 +5,6 @@ #include "test_modules.h" -#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) -REPEAT_10000(DECLARE_RETURN); - /* * Test that modules with many relocations are loaded properly. */ diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h index 43b5e4b4af3e..6371fcf17684 100644 --- a/arch/s390/lib/test_modules.h +++ b/arch/s390/lib/test_modules.h @@ -47,4 +47,7 @@ __REPEAT_10000_1(f, 8); \ __REPEAT_10000_1(f, 9) +#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) +REPEAT_10000(DECLARE_RETURN); + #endif From d0cbe56a7d5ac170f6cf3757ef5a14dd854e7da9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 6 Feb 2022 18:59:57 -0600 Subject: [PATCH 1126/1295] [smb3] improve error message when mount options conflict with posix POSIX extensions require SMB3.1.1 (so improve the error message when vers=3.0, 2.1 or 2.0 is specified on mount) Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0b742bd50642..cff6c01feae2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2340,10 +2340,19 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (ses->server->posix_ext_supported) { tcon->posix_extensions = true; pr_warn_once("SMB3.11 POSIX Extensions are experimental\n"); - } else { + } else if ((ses->server->vals->protocol_id == SMB311_PROT_ID) || + (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) || + (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0)) { cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; + } else { + cifs_dbg(VFS, "Check vers= mount option. SMB3.11 " + "disabled but required for POSIX extensions\n"); + rc = -EOPNOTSUPP; + goto out_fail; } } From 3037b174b1876aae6b2d1a27a878c681c78ccadc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Feb 2022 09:44:04 +0100 Subject: [PATCH 1127/1295] ARM: socfpga: fix missing RESET_CONTROLLER The SocFPGA machine since commit b3ca9888f35f ("reset: socfpga: add an early reset driver for SoCFPGA") uses reset controller, so it should select RESET_CONTROLLER explicitly. Selecting ARCH_HAS_RESET_CONTROLLER is not enough because it affects only default choice still allowing a non-buildable configuration: /usr/bin/arm-linux-gnueabi-ld: arch/arm/mach-socfpga/socfpga.o: in function `socfpga_init_irq': arch/arm/mach-socfpga/socfpga.c:56: undefined reference to `socfpga_reset_init' Reported-by: kernel test robot Cc: Fixes: b3ca9888f35f ("reset: socfpga: add an early reset driver for SoCFPGA") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Dinh Nguyen --- arch/arm/mach-socfpga/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-socfpga/Kconfig b/arch/arm/mach-socfpga/Kconfig index 43ddec677c0b..594edf9bbea4 100644 --- a/arch/arm/mach-socfpga/Kconfig +++ b/arch/arm/mach-socfpga/Kconfig @@ -2,6 +2,7 @@ menuconfig ARCH_INTEL_SOCFPGA bool "Altera SOCFPGA family" depends on ARCH_MULTI_V7 + select ARCH_HAS_RESET_CONTROLLER select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_GIC @@ -18,6 +19,7 @@ menuconfig ARCH_INTEL_SOCFPGA select PL310_ERRATA_727915 select PL310_ERRATA_753970 if PL310 select PL310_ERRATA_769419 + select RESET_CONTROLLER if ARCH_INTEL_SOCFPGA config SOCFPGA_SUSPEND From fc764b103b81ebe88f69f9ae4f116551ab8cfc5d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 3 Feb 2022 16:11:51 +0100 Subject: [PATCH 1128/1295] drm/vc4: crtc: Fix redundant variable assignment The variable is assigned twice to the same value. Let's drop one. Reported-by: kernel test robot Signed-off-by: Maxime Ripard Reviewed-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20220203151151.1270461-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_crtc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 287dbc89ad64..e6cc47470e03 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -671,7 +671,6 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc, const struct drm_display_mode *mode = &crtc_state->adjusted_mode; struct vc4_encoder *vc4_encoder = to_vc4_encoder(encoder); - mode = &crtc_state->adjusted_mode; if (vc4_encoder->type == VC4_ENCODER_TYPE_HDMI0) { vc4_state->hvs_load = max(mode->clock * mode->hdisplay / mode->htotal + 1000, mode->clock * 9 / 10) * 1000; From 94fdd7c02a56d0316d20e417a1141b71a8dcee82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 6 Feb 2022 06:33:48 -0800 Subject: [PATCH 1129/1295] net/smc: use GFP_ATOMIC allocation in smc_pnet_add_eth() My last patch moved the netdev_tracker_alloc() call to a section protected by a write_lock(). I should have replaced GFP_KERNEL with GFP_ATOMIC to avoid the infamous: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:256 Fixes: 28f922213886 ("net/smc: fix ref_tracker issue in smc_pnet_add()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index fb6331d97185..0599246c0376 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -382,7 +382,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, - GFP_KERNEL); + GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); write_unlock(&pnettable->lock); From eb48d42198792f1330bbb3e82ac725d43c13fe02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 26 Jan 2022 10:15:38 +0200 Subject: [PATCH 1130/1295] drm/i915: Fix oops due to missing stack depot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We call __save_depot_stack() unconditionally so the stack depot must always be initialized or else we'll oops on platforms without runtime pm support. Presumably we've not seen this in CI due to stack_depot_init() already getting called via drm_mm_init()+CONFIG_DRM_DEBUG_MM. Cc: Vlastimil Babka Cc: Dmitry Vyukov Cc: Marco Elver # stackdepot Cc: Chris Wilson Cc: Imre Deak Fixes: 2dba5eb1c73b ("lib/stackdepot: allow optional init and stack_table allocation by kvmalloc()") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220126081539.23227-1-ville.syrjala@linux.intel.com Acked-by: Vlastimil Babka Reviewed-by: Imre Deak (cherry picked from commit 751a9d69b19702af35b0fedfb8ff362027c1cf0c) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_runtime_pm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 53f1ccb78849..64c2708efc9e 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -68,9 +68,7 @@ static noinline depot_stack_handle_t __save_depot_stack(void) static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm) { spin_lock_init(&rpm->debug.lock); - - if (rpm->available) - stack_depot_init(); + stack_depot_init(); } static noinline depot_stack_handle_t From ee59792c97176f12c1da31f29fc4c2aab187f06e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 28 Jan 2022 12:37:50 +0200 Subject: [PATCH 1131/1295] drm/i915: Disable DRRS on IVB/HSW port != A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we allow DRRS on IVB PCH ports, but we're missing a few programming steps meaning it is guaranteed to not work. And on HSW DRRS is not supported on anything but port A ever as only transcoder EDP has the M2/N2 registers (though I'm not sure if HSW ever has eDP on any other port). Starting from BDW all transcoders have the dynamically reprogrammable M/N registers so DRRS could work on any port. Stop initializing DRRS on ports where it cannot possibly work. Cc: stable@vger.kernel.org Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220128103757.22461-11-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit f0d4ce59f4d48622044933054a0e0cefa91ba15e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_drrs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_drrs.c b/drivers/gpu/drm/i915/display/intel_drrs.c index c1439fcb5a95..3ff149df4a77 100644 --- a/drivers/gpu/drm/i915/display/intel_drrs.c +++ b/drivers/gpu/drm/i915/display/intel_drrs.c @@ -405,6 +405,7 @@ intel_drrs_init(struct intel_connector *connector, struct drm_display_mode *fixed_mode) { struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + struct intel_encoder *encoder = connector->encoder; struct drm_display_mode *downclock_mode = NULL; INIT_DELAYED_WORK(&dev_priv->drrs.work, intel_drrs_downclock_work); @@ -416,6 +417,13 @@ intel_drrs_init(struct intel_connector *connector, return NULL; } + if ((DISPLAY_VER(dev_priv) < 8 && !HAS_GMCH(dev_priv)) && + encoder->port != PORT_A) { + drm_dbg_kms(&dev_priv->drm, + "DRRS only supported on eDP port A\n"); + return NULL; + } + if (dev_priv->vbt.drrs_type != SEAMLESS_DRRS_SUPPORT) { drm_dbg_kms(&dev_priv->drm, "VBT doesn't support DRRS\n"); return NULL; From 3526b607b02397cdb6d459594e4f1d63133d6655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 1 Feb 2022 08:03:40 +0100 Subject: [PATCH 1132/1295] drm/i915/ttm: Return some errors instead of trying memcpy move MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The i915_ttm_accel_move() function may return error codes that should be propagated further up the stack rather than consumed assuming that the accel move failed and could be replaced with a memcpy move. For -EINTR, -ERESTARTSYS and -EAGAIN, just propagate those codes, rather than retrying with a memcpy move. Fixes: 2b0a750caf33 ("drm/i915/ttm: Failsafe migration blits") Cc: Matthew Auld Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20220201070340.16457-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 29b9702ffe70d83b9970abbccaeb287dfda4409f) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c index ee9612a3ee5e..e130c820ae4e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c @@ -427,11 +427,17 @@ __i915_ttm_move(struct ttm_buffer_object *bo, if (!IS_ERR(fence)) goto out; - } else if (move_deps) { - int err = i915_deps_sync(move_deps, ctx); + } else { + int err = PTR_ERR(fence); - if (err) - return ERR_PTR(err); + if (err == -EINTR || err == -ERESTARTSYS || err == -EAGAIN) + return fence; + + if (move_deps) { + err = i915_deps_sync(move_deps, ctx); + if (err) + return ERR_PTR(err); + } } /* Error intercept failed or no accelerated migration to start with */ From 9d7516b16f2a7ecbddd7940e582c78fcdc9136ef Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Mon, 31 Jan 2022 08:59:25 -0800 Subject: [PATCH 1133/1295] drm/i915: Fix header test for !CONFIG_X86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architectures others than x86 have a stub implementation calling WARN_ON_ONCE(). The appropriate headers need to be included, otherwise the header-test target will fail with: HDRTEST drivers/gpu/drm/i915/i915_mm.h In file included from : ./drivers/gpu/drm/i915/i915_mm.h: In function ‘remap_io_mapping’: ./drivers/gpu/drm/i915/i915_mm.h:26:2: error: implicit declaration of function ‘WARN_ON_ONCE’ [-Werror=implicit-function-declaration] 26 | WARN_ON_ONCE(1); | ^~~~~~~~~~~~ v2: Do not include since call to pr_err() has been removed Fixes: 67c430bbaae1 ("drm/i915: Skip remap_io_mapping() for non-x86 platforms") Cc: Siva Mullati Signed-off-by: Lucas De Marchi Reviewed-by: Siva Mullati Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220131165926.3230642-3-lucas.demarchi@intel.com (cherry picked from commit 377c675f3c17ffaefd023ee283bb366bbd6bbcea) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_mm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_mm.h b/drivers/gpu/drm/i915/i915_mm.h index 76f1d53bdf34..3ad22bbe80eb 100644 --- a/drivers/gpu/drm/i915/i915_mm.h +++ b/drivers/gpu/drm/i915/i915_mm.h @@ -6,6 +6,7 @@ #ifndef __I915_MM_H__ #define __I915_MM_H__ +#include #include struct vm_area_struct; From 8fd5a26e43859547790a7995494c952b708ab3b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 4 Feb 2022 16:18:16 +0200 Subject: [PATCH 1134/1295] drm/i915: Allow !join_mbus cases for adlp+ dbuf configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reintroduce the !join_mbus single pipe cases for adlp+. Due to the mbus relative dbuf offsets in PLANE_BUF_CFG we need to know the actual slices used by the pipe when doing readout, even when mbus joining isn't enabled. Accurate readout will be needed to properly sanitize invalid BIOS dbuf configurations. This will also make it much easier to play around with the !join_mbus configs for testin/workaround purposes. Cc: # v5.14+ Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220204141818.1900-1-ville.syrjala@linux.intel.com Reviewed-by: Stanislav Lisovskiy (cherry picked from commit eef173954432fe0612acb63421a95deb41155cdc) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_pm.c | 66 +++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 434b1f8b7fe3..ddae296ced9e 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -4717,6 +4717,10 @@ static const struct dbuf_slice_conf_entry dg2_allowed_dbufs[] = { }; static const struct dbuf_slice_conf_entry adlp_allowed_dbufs[] = { + /* + * Keep the join_mbus cases first so check_mbus_joined() + * will prefer them over the !join_mbus cases. + */ { .active_pipes = BIT(PIPE_A), .dbuf_mask = { @@ -4731,6 +4735,20 @@ static const struct dbuf_slice_conf_entry adlp_allowed_dbufs[] = { }, .join_mbus = true, }, + { + .active_pipes = BIT(PIPE_A), + .dbuf_mask = { + [PIPE_A] = BIT(DBUF_S1) | BIT(DBUF_S2), + }, + .join_mbus = false, + }, + { + .active_pipes = BIT(PIPE_B), + .dbuf_mask = { + [PIPE_B] = BIT(DBUF_S3) | BIT(DBUF_S4), + }, + .join_mbus = false, + }, { .active_pipes = BIT(PIPE_A) | BIT(PIPE_B), .dbuf_mask = { @@ -4847,13 +4865,14 @@ static bool adlp_check_mbus_joined(u8 active_pipes) return check_mbus_joined(active_pipes, adlp_allowed_dbufs); } -static u8 compute_dbuf_slices(enum pipe pipe, u8 active_pipes, +static u8 compute_dbuf_slices(enum pipe pipe, u8 active_pipes, bool join_mbus, const struct dbuf_slice_conf_entry *dbuf_slices) { int i; for (i = 0; i < dbuf_slices[i].active_pipes; i++) { - if (dbuf_slices[i].active_pipes == active_pipes) + if (dbuf_slices[i].active_pipes == active_pipes && + dbuf_slices[i].join_mbus == join_mbus) return dbuf_slices[i].dbuf_mask[pipe]; } return 0; @@ -4864,7 +4883,7 @@ static u8 compute_dbuf_slices(enum pipe pipe, u8 active_pipes, * returns correspondent DBuf slice mask as stated in BSpec for particular * platform. */ -static u8 icl_compute_dbuf_slices(enum pipe pipe, u8 active_pipes) +static u8 icl_compute_dbuf_slices(enum pipe pipe, u8 active_pipes, bool join_mbus) { /* * FIXME: For ICL this is still a bit unclear as prev BSpec revision @@ -4878,37 +4897,41 @@ static u8 icl_compute_dbuf_slices(enum pipe pipe, u8 active_pipes) * still here - we will need it once those additional constraints * pop up. */ - return compute_dbuf_slices(pipe, active_pipes, icl_allowed_dbufs); + return compute_dbuf_slices(pipe, active_pipes, join_mbus, + icl_allowed_dbufs); } -static u8 tgl_compute_dbuf_slices(enum pipe pipe, u8 active_pipes) +static u8 tgl_compute_dbuf_slices(enum pipe pipe, u8 active_pipes, bool join_mbus) { - return compute_dbuf_slices(pipe, active_pipes, tgl_allowed_dbufs); + return compute_dbuf_slices(pipe, active_pipes, join_mbus, + tgl_allowed_dbufs); } -static u32 adlp_compute_dbuf_slices(enum pipe pipe, u32 active_pipes) +static u8 adlp_compute_dbuf_slices(enum pipe pipe, u8 active_pipes, bool join_mbus) { - return compute_dbuf_slices(pipe, active_pipes, adlp_allowed_dbufs); + return compute_dbuf_slices(pipe, active_pipes, join_mbus, + adlp_allowed_dbufs); } -static u32 dg2_compute_dbuf_slices(enum pipe pipe, u32 active_pipes) +static u8 dg2_compute_dbuf_slices(enum pipe pipe, u8 active_pipes, bool join_mbus) { - return compute_dbuf_slices(pipe, active_pipes, dg2_allowed_dbufs); + return compute_dbuf_slices(pipe, active_pipes, join_mbus, + dg2_allowed_dbufs); } -static u8 skl_compute_dbuf_slices(struct intel_crtc *crtc, u8 active_pipes) +static u8 skl_compute_dbuf_slices(struct intel_crtc *crtc, u8 active_pipes, bool join_mbus) { struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); enum pipe pipe = crtc->pipe; if (IS_DG2(dev_priv)) - return dg2_compute_dbuf_slices(pipe, active_pipes); + return dg2_compute_dbuf_slices(pipe, active_pipes, join_mbus); else if (IS_ALDERLAKE_P(dev_priv)) - return adlp_compute_dbuf_slices(pipe, active_pipes); + return adlp_compute_dbuf_slices(pipe, active_pipes, join_mbus); else if (DISPLAY_VER(dev_priv) == 12) - return tgl_compute_dbuf_slices(pipe, active_pipes); + return tgl_compute_dbuf_slices(pipe, active_pipes, join_mbus); else if (DISPLAY_VER(dev_priv) == 11) - return icl_compute_dbuf_slices(pipe, active_pipes); + return icl_compute_dbuf_slices(pipe, active_pipes, join_mbus); /* * For anything else just return one slice yet. * Should be extended for other platforms. @@ -6127,11 +6150,16 @@ skl_compute_ddb(struct intel_atomic_state *state) return ret; } + if (IS_ALDERLAKE_P(dev_priv)) + new_dbuf_state->joined_mbus = + adlp_check_mbus_joined(new_dbuf_state->active_pipes); + for_each_intel_crtc(&dev_priv->drm, crtc) { enum pipe pipe = crtc->pipe; new_dbuf_state->slices[pipe] = - skl_compute_dbuf_slices(crtc, new_dbuf_state->active_pipes); + skl_compute_dbuf_slices(crtc, new_dbuf_state->active_pipes, + new_dbuf_state->joined_mbus); if (old_dbuf_state->slices[pipe] == new_dbuf_state->slices[pipe]) continue; @@ -6143,9 +6171,6 @@ skl_compute_ddb(struct intel_atomic_state *state) new_dbuf_state->enabled_slices = intel_dbuf_enabled_slices(new_dbuf_state); - if (IS_ALDERLAKE_P(dev_priv)) - new_dbuf_state->joined_mbus = adlp_check_mbus_joined(new_dbuf_state->active_pipes); - if (old_dbuf_state->enabled_slices != new_dbuf_state->enabled_slices || old_dbuf_state->joined_mbus != new_dbuf_state->joined_mbus) { ret = intel_atomic_serialize_global_state(&new_dbuf_state->base); @@ -6646,7 +6671,8 @@ void skl_wm_get_hw_state(struct drm_i915_private *dev_priv) } dbuf_state->slices[pipe] = - skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes); + skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes, + dbuf_state->joined_mbus); dbuf_state->weight[pipe] = intel_crtc_ddb_weight(crtc_state); From 85bb289215cf37e05e9581b39b114db1293f9ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 4 Feb 2022 16:18:17 +0200 Subject: [PATCH 1135/1295] drm/i915: Populate pipe dbuf slices more accurately during readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During readout we cannot assume the planes are actually using the slices they are supposed to use. The BIOS may have misprogrammed things and put the planes onto the wrong dbuf slices. So let's do the readout more carefully to make sure we really know which dbuf slices are actually in use by the pipe at the time. Cc: # v5.14+ Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220204141818.1900-2-ville.syrjala@linux.intel.com Reviewed-by: Stanislav Lisovskiy (cherry picked from commit b3dcc6dc0f32612d04839c2fb32e94d0ebf92c98) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_pm.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index ddae296ced9e..a298846dd8cf 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6651,6 +6651,7 @@ void skl_wm_get_hw_state(struct drm_i915_private *dev_priv) enum pipe pipe = crtc->pipe; unsigned int mbus_offset; enum plane_id plane_id; + u8 slices; skl_pipe_wm_get_hw_state(crtc, &crtc_state->wm.skl.optimal); crtc_state->wm.skl.raw = crtc_state->wm.skl.optimal; @@ -6670,20 +6671,22 @@ void skl_wm_get_hw_state(struct drm_i915_private *dev_priv) skl_ddb_entry_union(&dbuf_state->ddb[pipe], ddb_uv); } - dbuf_state->slices[pipe] = - skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes, - dbuf_state->joined_mbus); - dbuf_state->weight[pipe] = intel_crtc_ddb_weight(crtc_state); /* * Used for checking overlaps, so we need absolute * offsets instead of MBUS relative offsets. */ - mbus_offset = mbus_ddb_offset(dev_priv, dbuf_state->slices[pipe]); + slices = skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes, + dbuf_state->joined_mbus); + mbus_offset = mbus_ddb_offset(dev_priv, slices); crtc_state->wm.skl.ddb.start = mbus_offset + dbuf_state->ddb[pipe].start; crtc_state->wm.skl.ddb.end = mbus_offset + dbuf_state->ddb[pipe].end; + /* The slices actually used by the planes on the pipe */ + dbuf_state->slices[pipe] = + skl_ddb_dbuf_slice_mask(dev_priv, &crtc_state->wm.skl.ddb); + drm_dbg_kms(&dev_priv->drm, "[CRTC:%d:%s] dbuf slices 0x%x, ddb (%d - %d), active pipes 0x%x, mbus joined: %s\n", crtc->base.base.id, crtc->base.name, From 4e6f55120c7eccf6f9323bb681632e23cbcb3f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 4 Feb 2022 16:18:18 +0200 Subject: [PATCH 1136/1295] drm/i915: Workaround broken BIOS DBUF configuration on TGL/RKL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On TGL/RKL the BIOS likes to use some kind of bogus DBUF layout that doesn't match what the spec recommends. With a single active pipe that is not going to be a problem, but with multiple pipes active skl_commit_modeset_enables() goes into an infinite loop since it can't figure out any order in which it can commit the pipes without causing DBUF overlaps between the planes. We'd need some kind of extra DBUF defrag stage in between to make the transition possible. But that is clearly way too complex a solution, so in the name of simplicity let's just sanitize the DBUF state by simply turning off all planes when we detect a pipe encroaching on its neighbours' DBUF slices. We only have to disable the primary planes as all other planes should have already been disabled (if they somehow were enabled) by earlier sanitization steps. And for good measure let's also sanitize in case the DBUF allocations of the pipes already seem to overlap each other. Cc: # v5.14+ Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4762 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220204141818.1900-3-ville.syrjala@linux.intel.com Reviewed-by: Stanislav Lisovskiy (cherry picked from commit 15512021eb3975a8c2366e3883337e252bb0eee5) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_display.c | 1 + drivers/gpu/drm/i915/intel_pm.c | 68 ++++++++++++++++++++ drivers/gpu/drm/i915/intel_pm.h | 1 + 3 files changed, 70 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index bf7ce684dd8e..bb4a85445fc6 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -10673,6 +10673,7 @@ intel_modeset_setup_hw_state(struct drm_device *dev, vlv_wm_sanitize(dev_priv); } else if (DISPLAY_VER(dev_priv) >= 9) { skl_wm_get_hw_state(dev_priv); + skl_wm_sanitize(dev_priv); } else if (HAS_PCH_SPLIT(dev_priv)) { ilk_wm_get_hw_state(dev_priv); } diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index a298846dd8cf..3edba7fd0c49 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6698,6 +6698,74 @@ void skl_wm_get_hw_state(struct drm_i915_private *dev_priv) dbuf_state->enabled_slices = dev_priv->dbuf.enabled_slices; } +static bool skl_dbuf_is_misconfigured(struct drm_i915_private *i915) +{ + const struct intel_dbuf_state *dbuf_state = + to_intel_dbuf_state(i915->dbuf.obj.state); + struct skl_ddb_entry entries[I915_MAX_PIPES] = {}; + struct intel_crtc *crtc; + + for_each_intel_crtc(&i915->drm, crtc) { + const struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + + entries[crtc->pipe] = crtc_state->wm.skl.ddb; + } + + for_each_intel_crtc(&i915->drm, crtc) { + const struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + u8 slices; + + slices = skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes, + dbuf_state->joined_mbus); + if (dbuf_state->slices[crtc->pipe] & ~slices) + return true; + + if (skl_ddb_allocation_overlaps(&crtc_state->wm.skl.ddb, entries, + I915_MAX_PIPES, crtc->pipe)) + return true; + } + + return false; +} + +void skl_wm_sanitize(struct drm_i915_private *i915) +{ + struct intel_crtc *crtc; + + /* + * On TGL/RKL (at least) the BIOS likes to assign the planes + * to the wrong DBUF slices. This will cause an infinite loop + * in skl_commit_modeset_enables() as it can't find a way to + * transition between the old bogus DBUF layout to the new + * proper DBUF layout without DBUF allocation overlaps between + * the planes (which cannot be allowed or else the hardware + * may hang). If we detect a bogus DBUF layout just turn off + * all the planes so that skl_commit_modeset_enables() can + * simply ignore them. + */ + if (!skl_dbuf_is_misconfigured(i915)) + return; + + drm_dbg_kms(&i915->drm, "BIOS has misprogrammed the DBUF, disabling all planes\n"); + + for_each_intel_crtc(&i915->drm, crtc) { + struct intel_plane *plane = to_intel_plane(crtc->base.primary); + const struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + + if (plane_state->uapi.visible) + intel_plane_disable_noatomic(crtc, plane); + + drm_WARN_ON(&i915->drm, crtc_state->active_planes != 0); + + memset(&crtc_state->wm.skl.ddb, 0, sizeof(crtc_state->wm.skl.ddb)); + } +} + static void ilk_pipe_wm_get_hw_state(struct intel_crtc *crtc) { struct drm_device *dev = crtc->base.dev; diff --git a/drivers/gpu/drm/i915/intel_pm.h b/drivers/gpu/drm/i915/intel_pm.h index 990cdcaf85ce..d2243653a893 100644 --- a/drivers/gpu/drm/i915/intel_pm.h +++ b/drivers/gpu/drm/i915/intel_pm.h @@ -47,6 +47,7 @@ void skl_pipe_wm_get_hw_state(struct intel_crtc *crtc, struct skl_pipe_wm *out); void g4x_wm_sanitize(struct drm_i915_private *dev_priv); void vlv_wm_sanitize(struct drm_i915_private *dev_priv); +void skl_wm_sanitize(struct drm_i915_private *dev_priv); bool intel_can_enable_sagv(struct drm_i915_private *dev_priv, const struct intel_bw_state *bw_state); void intel_sagv_pre_plane_update(struct intel_atomic_state *state); From 200e8e3e43c4da4bd5ca83722523754ddb14ca02 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 7 Feb 2022 14:04:07 +0100 Subject: [PATCH 1137/1295] drm/privacy-screen: Fix sphinx warning Fix the following warning from "make htmldocs": drivers/gpu/drm/drm_privacy_screen.c:270: WARNING: Inline emphasis start-string without end-string. Fixes: 8a12b170558a ("drm/privacy-screen: Add notifier support (v2)") Reported-by: Stephen Rothwell Signed-off-by: Hans de Goede Reviewed-by: Simon Ser Link: https://lore.kernel.org/r/20220207130407.389585-1-hdegoede@redhat.com Link: https://patchwork.freedesktop.org/patch/msgid/20220207130407.389585-1-hdegoede@redhat.com --- drivers/gpu/drm/drm_privacy_screen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_privacy_screen.c b/drivers/gpu/drm/drm_privacy_screen.c index beaf99e9120a..b688841c18e4 100644 --- a/drivers/gpu/drm/drm_privacy_screen.c +++ b/drivers/gpu/drm/drm_privacy_screen.c @@ -269,7 +269,7 @@ EXPORT_SYMBOL(drm_privacy_screen_get_state); * * The notifier is called with no locks held. The new hw_state and sw_state * can be retrieved using the drm_privacy_screen_get_state() function. - * A pointer to the drm_privacy_screen's struct is passed as the void *data + * A pointer to the drm_privacy_screen's struct is passed as the ``void *data`` * argument of the notifier_block's notifier_call. * * The notifier will NOT be called when changes are made through From fda17afc6166e975bec1197bd94cd2a3317bce3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 7 Feb 2022 11:27:53 +0900 Subject: [PATCH 1138/1295] ata: libata-core: Fix ata_dev_config_cpr() The concurrent positioning ranges log page 47h is a general purpose log page and not a subpage of the indentify device log. Using ata_identify_page_supported() to test for concurrent positioning ranges support is thus wrong. ata_log_supported() must be used. Furthermore, unlike other advanced ATA features (e.g. NCQ priority), accesses to the concurrent positioning ranges log page are not gated by a feature bit from the device IDENTIFY data. Since many older drives react badly to the READ LOG EXT and/or READ LOG DMA EXT commands isued to read device log pages, avoid problems with older drives by limiting the concurrent positioning ranges support detection to drives implementing at least the ACS-4 ATA standard (major version 11). This additional condition effectively turns ata_dev_config_cpr() into a nop for older drives, avoiding problems in the field. Fixes: fe22e1c2f705 ("libata: support concurrent positioning ranges log") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215519 Cc: stable@vger.kernel.org Reviewed-by: Hannes Reinecke Tested-by: Abderraouf Adjal Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 14 ++++++-------- include/linux/ata.h | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e1b1dd215267..ba9273f80069 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2448,23 +2448,21 @@ static void ata_dev_config_cpr(struct ata_device *dev) struct ata_cpr_log *cpr_log = NULL; u8 *desc, *buf = NULL; - if (!ata_identify_page_supported(dev, - ATA_LOG_CONCURRENT_POSITIONING_RANGES)) + if (ata_id_major_version(dev->id) < 11 || + !ata_log_supported(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES)) goto out; /* - * Read IDENTIFY DEVICE data log, page 0x47 - * (concurrent positioning ranges). We can have at most 255 32B range - * descriptors plus a 64B header. + * Read the concurrent positioning ranges log (0x47). We can have at + * most 255 32B range descriptors plus a 64B header. */ buf_len = (64 + 255 * 32 + 511) & ~511; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) goto out; - err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, - ATA_LOG_CONCURRENT_POSITIONING_RANGES, - buf, buf_len >> 9); + err_mask = ata_read_log_page(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES, + 0, buf, buf_len >> 9); if (err_mask) goto out; diff --git a/include/linux/ata.h b/include/linux/ata.h index 199e47e97d64..21292b5bbb55 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -324,12 +324,12 @@ enum { ATA_LOG_NCQ_NON_DATA = 0x12, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_IDENTIFY_DEVICE = 0x30, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device log pages: */ ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, - ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, From bf5bdcc9f262b5afd3c0f06c39b34b4f2fcff661 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Thu, 13 Jan 2022 18:19:20 +0100 Subject: [PATCH 1139/1295] MAINTAINERS: Update Benjamin Gaignard maintainer status Update Benjamin Gaignard address and remove it from no more maintained drivers. Signed-off-by: Benjamin Gaignard Reviewed-by: Philippe Cornu Link: https://lore.kernel.org/r/20220113171921.17466-2-philippe.cornu@foss.st.com' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 69a2935daf6c..a7a7dab2e41d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5772,7 +5772,7 @@ F: tools/testing/selftests/dma/ DMA-BUF HEAPS FRAMEWORK M: Sumit Semwal -R: Benjamin Gaignard +R: Benjamin Gaignard R: Liam Mark R: Laura Abbott R: Brian Starkey @@ -6502,7 +6502,6 @@ F: Documentation/devicetree/bindings/display/rockchip/ F: drivers/gpu/drm/rockchip/ DRM DRIVERS FOR STI -M: Benjamin Gaignard L: dri-devel@lists.freedesktop.org S: Maintained T: git git://anongit.freedesktop.org/drm/drm-misc @@ -6512,7 +6511,6 @@ F: drivers/gpu/drm/sti DRM DRIVERS FOR STM M: Yannick Fertre M: Philippe Cornu -M: Benjamin Gaignard L: dri-devel@lists.freedesktop.org S: Maintained T: git git://anongit.freedesktop.org/drm/drm-misc @@ -18441,7 +18439,6 @@ F: Documentation/devicetree/bindings/sound/st,sti-asoc-card.txt F: sound/soc/sti/ STI CEC DRIVER -M: Benjamin Gaignard S: Maintained F: Documentation/devicetree/bindings/media/stih-cec.txt F: drivers/media/cec/platform/sti/ From 60f40305529b38d6c3903bd833dc25e39b94e5b6 Mon Sep 17 00:00:00 2001 From: Philippe Cornu Date: Thu, 13 Jan 2022 18:19:21 +0100 Subject: [PATCH 1140/1295] MAINTAINERS: update drm/stm drm/sti and cec/sti maintainers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Alain as sti maintainer for both drm/sti & cec/sti. Add Raphaël as stm maintainer for drm/stm. Signed-off-by: Philippe Cornu Reviewed-by: Raphael Gallais-Pou Reviewed-by: Alain Volmat Link: https://lore.kernel.org/r/20220113171921.17466-3-philippe.cornu@foss.st.com' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a7a7dab2e41d..caa5b018bd36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6502,6 +6502,7 @@ F: Documentation/devicetree/bindings/display/rockchip/ F: drivers/gpu/drm/rockchip/ DRM DRIVERS FOR STI +M: Alain Volmat L: dri-devel@lists.freedesktop.org S: Maintained T: git git://anongit.freedesktop.org/drm/drm-misc @@ -6510,6 +6511,7 @@ F: drivers/gpu/drm/sti DRM DRIVERS FOR STM M: Yannick Fertre +M: Raphael Gallais-Pou M: Philippe Cornu L: dri-devel@lists.freedesktop.org S: Maintained @@ -18439,6 +18441,7 @@ F: Documentation/devicetree/bindings/sound/st,sti-asoc-card.txt F: sound/soc/sti/ STI CEC DRIVER +M: Alain Volmat S: Maintained F: Documentation/devicetree/bindings/media/stih-cec.txt F: drivers/media/cec/platform/sti/ From 18a1d5e1945385d9b5adc3fe11427ce4a9d2826e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Feb 2022 17:16:39 +0200 Subject: [PATCH 1141/1295] parisc: Add ioread64_lo_hi() and iowrite64_lo_hi() It's a followup to the previous commit f15309d7ad5d ("parisc: Add ioread64_hi_lo() and iowrite64_hi_lo()") which does only half of the job. Add the rest, so we won't get a new kernel test robot reports. Fixes: f15309d7ad5d ("parisc: Add ioread64_hi_lo() and iowrite64_hi_lo()") Signed-off-by: Andy Shevchenko Signed-off-by: Helge Deller --- arch/parisc/lib/iomap.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index 367f6397bda7..860385058085 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -346,6 +346,16 @@ u64 ioread64be(const void __iomem *addr) return *((u64 *)addr); } +u64 ioread64_lo_hi(const void __iomem *addr) +{ + u32 low, high; + + low = ioread32(addr); + high = ioread32(addr + sizeof(u32)); + + return low + ((u64)high << 32); +} + u64 ioread64_hi_lo(const void __iomem *addr) { u32 low, high; @@ -419,6 +429,12 @@ void iowrite64be(u64 datum, void __iomem *addr) } } +void iowrite64_lo_hi(u64 val, void __iomem *addr) +{ + iowrite32(val, addr); + iowrite32(val >> 32, addr + sizeof(u32)); +} + void iowrite64_hi_lo(u64 val, void __iomem *addr) { iowrite32(val >> 32, addr + sizeof(u32)); @@ -530,6 +546,7 @@ EXPORT_SYMBOL(ioread32); EXPORT_SYMBOL(ioread32be); EXPORT_SYMBOL(ioread64); EXPORT_SYMBOL(ioread64be); +EXPORT_SYMBOL(ioread64_lo_hi); EXPORT_SYMBOL(ioread64_hi_lo); EXPORT_SYMBOL(iowrite8); EXPORT_SYMBOL(iowrite16); @@ -538,6 +555,7 @@ EXPORT_SYMBOL(iowrite32); EXPORT_SYMBOL(iowrite32be); EXPORT_SYMBOL(iowrite64); EXPORT_SYMBOL(iowrite64be); +EXPORT_SYMBOL(iowrite64_lo_hi); EXPORT_SYMBOL(iowrite64_hi_lo); EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); From 0d7c1153d9291197c1dc473cfaade77acb874b4b Mon Sep 17 00:00:00 2001 From: Alviro Iskandar Setiawan Date: Mon, 7 Feb 2022 21:05:33 +0700 Subject: [PATCH 1142/1295] io_uring: Clean up a false-positive warning from GCC 9.3.0 In io_recv(), if import_single_range() fails, the @flags variable is uninitialized, then it will goto out_free. After the goto, the compiler doesn't know that (ret < min_ret) is always true, so it thinks the "if ((flags & MSG_WAITALL) ..." path could be taken. The complaint comes from gcc-9 (Debian 9.3.0-22) 9.3.0: ``` fs/io_uring.c:5238 io_recvfrom() error: uninitialized symbol 'flags' ``` Fix this by bypassing the @ret and @flags check when import_single_range() fails. Reasons: 1. import_single_range() only returns -EFAULT when it fails. 2. At that point, @flags is uninitialized and shouldn't be read. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: "Chen, Rong A" Link: https://lore.gnuweeb.org/timl/d33bb5a9-8173-f65b-f653-51fc0681c6d6@intel.com/ Cc: Pavel Begunkov Suggested-by: Ammar Faizi Fixes: 7297ce3d59449de49d3c9e1f64ae25488750a1fc ("io_uring: improve send/recv error handling") Signed-off-by: Alviro Iskandar Setiawan Signed-off-by: Ammar Faizi Link: https://lore.kernel.org/r/20220207140533.565411-1-ammarfaizi2@gnuweeb.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2e04f718319d..3445c4da0153 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5228,7 +5228,6 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&msg.msg_iter); ret = sock_recvmsg(sock, &msg, flags); -out_free: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return -EAGAIN; @@ -5236,9 +5235,9 @@ out_free: ret = -EINTR; req_set_fail(req); } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +out_free: req_set_fail(req); } - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); return 0; } From 0a3f1e0beacf6cc8ae5f846b0641c1df476e83d6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Jan 2022 21:17:36 -0800 Subject: [PATCH 1143/1295] mm: io_uring: allow oom-killer from io_uring_setup On an overcommitted system which is running multiple workloads of varying priorities, it is preferred to trigger an oom-killer to kill a low priority workload than to let the high priority workload receiving ENOMEMs. On our memory overcommitted systems, we are seeing a lot of ENOMEMs instead of oom-kills because io_uring_setup callchain is using __GFP_NORETRY gfp flag which avoids the oom-killer. Let's remove it and allow the oom-killer to kill a lower priority job. Signed-off-by: Shakeel Butt Link: https://lore.kernel.org/r/20220125051736.2981459-1-shakeelb@google.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3445c4da0153..77b9c7e4793b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8932,10 +8932,9 @@ static void io_mem_free(void *ptr) static void *io_mem_alloc(size_t size) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | - __GFP_NORETRY | __GFP_ACCOUNT; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - return (void *) __get_free_pages(gfp_flags, get_order(size)); + return (void *) __get_free_pages(gfp, get_order(size)); } static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, From 8bc69f86328e87a0ffa79438430cc82f3aa6a194 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 4 Feb 2022 01:30:08 +0800 Subject: [PATCH 1144/1295] Drivers: hv: vmbus: Fix memory leak in vmbus_add_channel_kobj MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kobject_init_and_add() takes reference even when it fails. According to the doc of kobject_init_and_add(): If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Fix memory leak by calling kobject_put(). Fixes: c2e5df616e1a ("vmbus: add per-channel sysfs info") Signed-off-by: Miaoqian Lin Reviewed-by: Juan Vazquez Link: https://lore.kernel.org/r/20220203173008.43480-1-linmq006@gmail.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 17bf55fe3169..34a4fd21bdf5 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2028,8 +2028,10 @@ int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel) kobj->kset = dev->channels_kset; ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL, "%u", relid); - if (ret) + if (ret) { + kobject_put(kobj); return ret; + } ret = sysfs_create_group(kobj, &vmbus_chan_group); @@ -2038,6 +2040,7 @@ int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel) * The calling functions' error handling paths will cleanup the * empty channel directory. */ + kobject_put(kobj); dev_err(device, "Unable to set up channel sysfs files\n"); return ret; } From 6bf625a4140f24b490766043b307f8252519578b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 6 Feb 2022 11:36:56 -0800 Subject: [PATCH 1145/1295] Drivers: hv: vmbus: Rework use of DMA_BIT_MASK(64) Using DMA_BIT_MASK(64) as an initializer for a global variable causes problems with Clang 12.0.1. The compiler doesn't understand that value 64 is excluded from the shift at compile time, resulting in a build error. While this is a compiler problem, avoid the issue by setting up the dma_mask memory as part of struct hv_device, and initialize it using dma_set_mask(). Reported-by: Nathan Chancellor Reported-by: Vitaly Chikunov Reported-by: Jakub Kicinski Fixes: 743b237c3a7b ("scsi: storvsc: Add Isolation VM support for storvsc driver") Signed-off-by: Michael Kelley Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/1644176216-12531-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 4 ++-- include/linux/hyperv.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 34a4fd21bdf5..12a2b37e87f3 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2082,7 +2082,6 @@ struct hv_device *vmbus_device_create(const guid_t *type, return child_device_obj; } -static u64 vmbus_dma_mask = DMA_BIT_MASK(64); /* * vmbus_device_register - Register the child device */ @@ -2123,8 +2122,9 @@ int vmbus_device_register(struct hv_device *child_device_obj) } hv_debug_add_dev_dir(child_device_obj); - child_device_obj->device.dma_mask = &vmbus_dma_mask; child_device_obj->device.dma_parms = &child_device_obj->dma_parms; + child_device_obj->device.dma_mask = &child_device_obj->dma_mask; + dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64)); return 0; err_kset_unregister: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f565a8938836..fe2e0179ed51 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1262,6 +1262,7 @@ struct hv_device { struct vmbus_channel *channel; struct kset *channels_kset; struct device_dma_parameters dma_parms; + u64 dma_mask; /* place holder to keep track of the dir for hv device in debugfs */ struct dentry *debug_dir; From d6ebb17ccc7b37872a32bc25b4a21f1e5af8c7e3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 28 Jan 2022 14:35:03 -0600 Subject: [PATCH 1146/1295] ACPI: PM: Revert "Only mark EC GPE for wakeup on Intel systems" Testing on various upcoming OEM systems shows commit 7b167c4cb48e ("ACPI: PM: Only mark EC GPE for wakeup on Intel systems") was short sighted and the symptoms were indicative of other problems. Some OEMs do have the dedicated GPIOs for the power button but also rely upon an interrupt to the EC SCI to let the lid work. The original commit showed spurious activity on Lenovo systems: * On both Lenovo T14 and P14s the keyboard wakeup doesn't work, and sometimes the power button event doesn't work. This was confirmed on my end at that time. However further development in the kernel showed that the issue was actually the IRQ for the GPIO controller was also shared with the EC SCI. This was actually fixed by commit 2d54067fcd23 ("pinctrl: amd: Fix wakeups when IRQ is shared with SCI"). The original commit also showed problems with AC adapter: * On HP 635 G7 detaching or attaching AC during suspend will cause the system not to wakeup * On Asus vivobook to prevent detaching AC causing resume problems * On Lenovo 14ARE05 to prevent detaching AC causing resume problems * On HP ENVY x360 to prevent detaching AC causing resume problems Detaching AC adapter causing problems appears to have been a problem because the EC SCI went off to notify the OS of the power adapter change but the SCI was ignored and there was no other way to wake up this system since GPIO controller wasn't properly enabled. The wakeups were fixed by enabling the GPIO controller in commit acd47b9f28e5 ("pinctrl: amd: Handle wake-up interrupt"). I've confirmed on a variety of OEM notebooks with the following test 1) echo 1 | sudo tee /sys/power/pm_debug_messages 2) sudo systemctl suspend 3) unplug AC adapter, make sure system is still asleep 4) wake system from lid (which is provided by ACPI SCI on some of them) 5) dmesg a) see the EC GPE dispatched, timekeeping for X seconds (matching ~time until AC adapter plug out) b) see timekeeping for Y seconds until woke (matching ~time from AC adapter until lid event) 6) Look at /sys/kernel/debug/amd_pmc/s0ix_stats "Time (in us) in S0i3" = X + Y - firmware processing time Signed-off-by: Mario Limonciello Tested-by: Kai-Heng Feng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/s2idle.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index abc06e7f89d8..ed889f827f53 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -424,15 +424,11 @@ static int lps0_device_attach(struct acpi_device *adev, mem_sleep_current = PM_SUSPEND_TO_IDLE; /* - * Some Intel based LPS0 systems, like ASUS Zenbook UX430UNR/i7-8550U don't - * use intel-hid or intel-vbtn but require the EC GPE to be enabled while - * suspended for certain wakeup devices to work, so mark it as wakeup-capable. - * - * Only enable on !AMD as enabling this universally causes problems for a number - * of AMD based systems. + * Some LPS0 systems, like ASUS Zenbook UX430UNR/i7-8550U, require the + * EC GPE to be enabled while suspended for certain wakeup devices to + * work, so mark it as wakeup-capable. */ - if (!acpi_s2idle_vendor_amd()) - acpi_ec_mark_gpe_for_wake(); + acpi_ec_mark_gpe_for_wake(); return 0; } From dc0075ba7f387fe4c48a8c674b11ab6f374a6acc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Feb 2022 18:31:02 +0100 Subject: [PATCH 1147/1295] ACPI: PM: s2idle: Cancel wakeup before dispatching EC GPE Commit 4a9af6cac050 ("ACPI: EC: Rework flushing of EC work while suspended to idle") made acpi_ec_dispatch_gpe() check pm_wakeup_pending(), but that is before canceling the SCI wakeup, so pm_wakeup_pending() is always true. This causes the loop in acpi_ec_dispatch_gpe() to always terminate after one iteration which may not be correct. Address this issue by canceling the SCI wakeup earlier, from acpi_ec_dispatch_gpe() itself. Fixes: 4a9af6cac050 ("ACPI: EC: Rework flushing of EC work while suspended to idle") Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 10 ++++++++++ drivers/acpi/sleep.c | 14 ++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 0077d2c85df8..46710380a402 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -2065,6 +2065,16 @@ bool acpi_ec_dispatch_gpe(void) if (acpi_any_gpe_status_set(first_ec->gpe)) return true; + /* + * Cancel the SCI wakeup and process all pending events in case there + * are any wakeup ones in there. + * + * Note that if any non-EC GPEs are active at this point, the SCI will + * retrigger after the rearming in acpi_s2idle_wake(), so no events + * should be missed by canceling the wakeup here. + */ + pm_system_cancel_wakeup(); + /* * Dispatch the EC GPE in-band, but do not report wakeup in any case * to allow the caller to process events properly after that. diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a60ff5dfed3a..fac7c9d4c9a1 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -736,21 +736,15 @@ bool acpi_s2idle_wake(void) return true; } - /* Check non-EC GPE wakeups and dispatch the EC GPE. */ + /* + * Check non-EC GPE wakeups and if there are none, cancel the + * SCI-related wakeup and dispatch the EC GPE. + */ if (acpi_ec_dispatch_gpe()) { pm_pr_dbg("ACPI non-EC GPE wakeup\n"); return true; } - /* - * Cancel the SCI wakeup and process all pending events in case - * there are any wakeup ones in there. - * - * Note that if any non-EC GPEs are active at this point, the - * SCI will retrigger after the rearming below, so no events - * should be missed by canceling the wakeup here. - */ - pm_system_cancel_wakeup(); acpi_os_wait_events_complete(); /* From cb1f65c1e1424a4b5e4a86da8aa3b8fd8459c8ec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Feb 2022 18:35:22 +0100 Subject: [PATCH 1148/1295] PM: s2idle: ACPI: Fix wakeup interrupts handling After commit e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") wakeup interrupts occurring immediately after the one discarded by acpi_s2idle_wake() may be missed. Moreover, if the SCI triggers again immediately after the rearming in acpi_s2idle_wake(), that wakeup may be missed too. The problem is that pm_system_irq_wakeup() only calls pm_system_wakeup() when pm_wakeup_irq is 0, but that's not the case any more after the interrupt causing acpi_s2idle_wake() to run until pm_wakeup_irq is cleared by the pm_wakeup_clear() call in s2idle_loop(). However, there may be wakeup interrupts occurring in that time frame and if that happens, they will be missed. To address that issue first move the clearing of pm_wakeup_irq to the point at which it is known that the interrupt causing acpi_s2idle_wake() to tun will be discarded, before rearming the SCI for wakeup. Moreover, because that only reduces the size of the time window in which the issue may manifest itself, allow pm_system_irq_wakeup() to register two second wakeup interrupts in a row and, when discarding the first one, replace it with the second one. [Of course, this assumes that only one wakeup interrupt can be discarded in one go, but currently that is the case and I am not aware of any plans to change that.] Fixes: e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 1 + drivers/base/power/wakeup.c | 41 ++++++++++++++++++++++++++++++------- include/linux/suspend.h | 4 ++-- kernel/power/main.c | 5 ++++- kernel/power/process.c | 2 +- kernel/power/suspend.c | 2 -- 6 files changed, 42 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index fac7c9d4c9a1..d4fbea91ab6b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -758,6 +758,7 @@ bool acpi_s2idle_wake(void) return true; } + pm_wakeup_clear(acpi_sci_irq); rearm_wake_irq(acpi_sci_irq); } diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 99bda0da23a8..8666590201c9 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -34,7 +34,8 @@ suspend_state_t pm_suspend_target_state; bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ -unsigned int pm_wakeup_irq __read_mostly; +static unsigned int wakeup_irq[2] __read_mostly; +static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; @@ -942,19 +943,45 @@ void pm_system_cancel_wakeup(void) atomic_dec_if_positive(&pm_abort_suspend); } -void pm_wakeup_clear(bool reset) +void pm_wakeup_clear(unsigned int irq_number) { - pm_wakeup_irq = 0; - if (reset) + raw_spin_lock_irq(&wakeup_irq_lock); + + if (irq_number && wakeup_irq[0] == irq_number) + wakeup_irq[0] = wakeup_irq[1]; + else + wakeup_irq[0] = 0; + + wakeup_irq[1] = 0; + + raw_spin_unlock_irq(&wakeup_irq_lock); + + if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { - if (pm_wakeup_irq == 0) { - pm_wakeup_irq = irq_number; + unsigned long flags; + + raw_spin_lock_irqsave(&wakeup_irq_lock, flags); + + if (wakeup_irq[0] == 0) + wakeup_irq[0] = irq_number; + else if (wakeup_irq[1] == 0) + wakeup_irq[1] = irq_number; + else + irq_number = 0; + + raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); + + if (irq_number) pm_system_wakeup(); - } +} + +unsigned int pm_wakeup_irq(void) +{ + return wakeup_irq[0]; } /** diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3e8ecdebe601..300273ff40cc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -497,14 +497,14 @@ extern void ksys_sync_helper(void); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern unsigned int pm_wakeup_irq; extern suspend_state_t pm_suspend_target_state; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); -extern void pm_wakeup_clear(bool reset); +extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); +extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..7e646079fbeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..11b570fcf049 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -134,7 +134,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..6fcdee7e87a5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,8 +136,6 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); - s2idle_enter(); } From fe4f57bf7b585dca58f1496c4e2481ecbae18126 Mon Sep 17 00:00:00 2001 From: Pavel Parkhomenko Date: Sat, 5 Feb 2022 23:39:32 +0300 Subject: [PATCH 1149/1295] net: phy: marvell: Fix RGMII Tx/Rx delays setting in 88e1121-compatible PHYs It is mandatory for a software to issue a reset upon modifying RGMII Receive Timing Control and RGMII Transmit Timing Control bit fields of MAC Specific Control register 2 (page 2, register 21) otherwise the changes won't be perceived by the PHY (the same is applicable for a lot of other registers). Not setting the RGMII delays on the platforms that imply it' being done on the PHY side will consequently cause the traffic loss. We discovered that the denoted soft-reset is missing in the m88e1121_config_aneg() method for the case if the RGMII delays are modified but the MDIx polarity isn't changed or the auto-negotiation is left enabled, thus causing the traffic loss on our platform with Marvell Alaska 88E1510 installed. Let's fix that by issuing the soft-reset if the delays have been actually set in the m88e1121_config_aneg_rgmii_delays() method. Cc: stable@vger.kernel.org Fixes: d6ab93364734 ("net: phy: marvell: Avoid unnecessary soft reset") Signed-off-by: Pavel Parkhomenko Reviewed-by: Russell King (Oracle) Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/20220205203932.26899-1-Pavel.Parkhomenko@baikalelectronics.ru Signed-off-by: Jakub Kicinski --- drivers/net/phy/marvell.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index ab063961ac00..2429db614b59 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -553,9 +553,9 @@ static int m88e1121_config_aneg_rgmii_delays(struct phy_device *phydev) else mscr = 0; - return phy_modify_paged(phydev, MII_MARVELL_MSCR_PAGE, - MII_88E1121_PHY_MSCR_REG, - MII_88E1121_PHY_MSCR_DELAY_MASK, mscr); + return phy_modify_paged_changed(phydev, MII_MARVELL_MSCR_PAGE, + MII_88E1121_PHY_MSCR_REG, + MII_88E1121_PHY_MSCR_DELAY_MASK, mscr); } static int m88e1121_config_aneg(struct phy_device *phydev) @@ -569,11 +569,13 @@ static int m88e1121_config_aneg(struct phy_device *phydev) return err; } + changed = err; + err = marvell_set_polarity(phydev, phydev->mdix_ctrl); if (err < 0) return err; - changed = err; + changed |= err; err = genphy_config_aneg(phydev); if (err < 0) From 8a4c5b2a6d8ea079fa36034e8167de87ab6f8880 Mon Sep 17 00:00:00 2001 From: Brenda Streiff Date: Fri, 28 Jan 2022 16:01:28 -0600 Subject: [PATCH 1150/1295] kconfig: let 'shell' return enough output for deep path names The 'shell' built-in only returns the first 256 bytes of the command's output. In some cases, 'shell' is used to return a path; by bumping up the buffer size to 4096 this lets us capture up to PATH_MAX. The specific case where I ran into this was due to commit 1e860048c53e ("gcc-plugins: simplify GCC plugin-dev capability test"). After this change, we now use `$(shell,$(CC) -print-file-name=plugin)` to return a path; if the gcc path is particularly long, then the path ends up truncated at the 256 byte mark, which makes the HAVE_GCC_PLUGINS depends test always fail. Signed-off-by: Brenda Streiff Signed-off-by: Masahiro Yamada --- scripts/kconfig/preprocess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/preprocess.c b/scripts/kconfig/preprocess.c index 0590f86df6e4..748da578b418 100644 --- a/scripts/kconfig/preprocess.c +++ b/scripts/kconfig/preprocess.c @@ -141,7 +141,7 @@ static char *do_lineno(int argc, char *argv[]) static char *do_shell(int argc, char *argv[]) { FILE *p; - char buf[256]; + char buf[4096]; char *cmd; size_t nread; int i; From 1cf5f151d25fcca94689efd91afa0253621fb33a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 2 Feb 2022 16:05:16 -0700 Subject: [PATCH 1151/1295] Makefile.extrawarn: Move -Wunaligned-access to W=1 -Wunaligned-access is a new warning in clang that is default enabled for arm and arm64 under certain circumstances within the clang frontend (see LLVM commit below). On v5.17-rc2, an ARCH=arm allmodconfig build shows 1284 total/70 unique instances of this warning (most of the instances are in header files), which is quite noisy. To keep a normal build green through CONFIG_WERROR, only show this warning with W=1, which will allow automated build systems to catch new instances of the warning so that the total number can be driven down to zero eventually since catching unaligned accesses at compile time would be generally useful. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/commit/35737df4dcd28534bd3090157c224c19b501278a Link: https://github.com/ClangBuiltLinux/linux/issues/1569 Link: https://github.com/ClangBuiltLinux/linux/issues/1576 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- scripts/Makefile.extrawarn | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index d53825503874..8be892887d71 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -51,6 +51,7 @@ KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += -Wno-format-zero-length KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare +KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) endif endif From c80b27cfd93ba9f5161383f798414609e84729f3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 7 Feb 2022 10:05:16 -0800 Subject: [PATCH 1152/1295] scsi: lpfc: Remove NVMe support if kernel has NVME_FC disabled The driver is initiating NVMe PRLIs to determine device NVMe support. This should not be occurring if CONFIG_NVME_FC support is disabled. Correct this by changing the default value for FC4 support. Currently it defaults to FCP and NVMe. With change, when NVME_FC support is not enabled in the kernel, the default value is just FCP. Link: https://lore.kernel.org/r/20220207180516.73052-1-jsmart2021@gmail.com Reviewed-by: Ewan D. Milne Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 13 ++++++++++--- drivers/scsi/lpfc/lpfc_attr.c | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 4878c94761f9..a1e0a106c132 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1161,6 +1161,16 @@ struct lpfc_hba { uint32_t cfg_hostmem_hgp; uint32_t cfg_log_verbose; uint32_t cfg_enable_fc4_type; +#define LPFC_ENABLE_FCP 1 +#define LPFC_ENABLE_NVME 2 +#define LPFC_ENABLE_BOTH 3 +#if (IS_ENABLED(CONFIG_NVME_FC)) +#define LPFC_MAX_ENBL_FC4_TYPE LPFC_ENABLE_BOTH +#define LPFC_DEF_ENBL_FC4_TYPE LPFC_ENABLE_BOTH +#else +#define LPFC_MAX_ENBL_FC4_TYPE LPFC_ENABLE_FCP +#define LPFC_DEF_ENBL_FC4_TYPE LPFC_ENABLE_FCP +#endif uint32_t cfg_aer_support; uint32_t cfg_sriov_nr_virtfn; uint32_t cfg_request_firmware_upgrade; @@ -1182,9 +1192,6 @@ struct lpfc_hba { uint32_t cfg_ras_fwlog_func; uint32_t cfg_enable_bbcr; /* Enable BB Credit Recovery */ uint32_t cfg_enable_dpp; /* Enable Direct Packet Push */ -#define LPFC_ENABLE_FCP 1 -#define LPFC_ENABLE_NVME 2 -#define LPFC_ENABLE_BOTH 3 uint32_t cfg_enable_pbde; uint32_t cfg_enable_mi; struct nvmet_fc_target_port *targetport; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 7a7f17d71811..bac78fbce8d6 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3978,8 +3978,8 @@ LPFC_ATTR_R(nvmet_mrq_post, * 3 - register both FCP and NVME * Supported values are [1,3]. Default value is 3 */ -LPFC_ATTR_R(enable_fc4_type, LPFC_ENABLE_BOTH, - LPFC_ENABLE_FCP, LPFC_ENABLE_BOTH, +LPFC_ATTR_R(enable_fc4_type, LPFC_DEF_ENBL_FC4_TYPE, + LPFC_ENABLE_FCP, LPFC_MAX_ENBL_FC4_TYPE, "Enable FC4 Protocol support - FCP / NVME"); /* From 5852ed2a6a39c862c8a3fdf646e1f4e01b91d710 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 7 Feb 2022 10:04:42 -0800 Subject: [PATCH 1153/1295] scsi: lpfc: Reduce log messages seen after firmware download Messages around firmware download were incorrectly tagged as being related to discovery trace events. Thus, firmware download status ended up dumping the trace log as well as the firmware update message. As there were a couple of log messages in this state, the trace log was dumped multiple times. Resolve this by converting from trace events to SLI events. Link: https://lore.kernel.org/r/20220207180442.72836-1-jsmart2021@gmail.com Reviewed-by: Ewan D. Milne Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index a56f01f659f8..558f7d2559c4 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -2104,7 +2104,7 @@ lpfc_handle_eratt_s4(struct lpfc_hba *phba) } if (reg_err1 == SLIPORT_ERR1_REG_ERR_CODE_2 && reg_err2 == SLIPORT_ERR2_REG_FW_RESTART) { - lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, + lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "3143 Port Down: Firmware Update " "Detected\n"); en_rn_msg = false; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 1bc0db572d9e..430abebf99f1 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -13363,6 +13363,7 @@ lpfc_sli4_eratt_read(struct lpfc_hba *phba) uint32_t uerr_sta_hi, uerr_sta_lo; uint32_t if_type, portsmphr; struct lpfc_register portstat_reg; + u32 logmask; /* * For now, use the SLI4 device internal unrecoverable error @@ -13413,7 +13414,12 @@ lpfc_sli4_eratt_read(struct lpfc_hba *phba) readl(phba->sli4_hba.u.if_type2.ERR1regaddr); phba->work_status[1] = readl(phba->sli4_hba.u.if_type2.ERR2regaddr); - lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, + logmask = LOG_TRACE_EVENT; + if (phba->work_status[0] == + SLIPORT_ERR1_REG_ERR_CODE_2 && + phba->work_status[1] == SLIPORT_ERR2_REG_FW_RESTART) + logmask = LOG_SLI; + lpfc_printf_log(phba, KERN_ERR, logmask, "2885 Port Status Event: " "port status reg 0x%x, " "port smphr reg 0x%x, " From 921d2eb09673af8e74aa11369a4de700e676a9c1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 8 Feb 2022 15:16:09 +0900 Subject: [PATCH 1154/1295] ata: sata_fsl: fix sscanf() and sysfs_emit() format strings Use the %u format for unsigned int parameters handling with sscanf() and sysfs_emit() to avoid compilation warnings. In fsl_sata_rx_watermark_store(), the call to sscanf() to parse a single argument is replaced with a call to kstrtouint(). While at it, also replace the printk(KERN_ERR) calls with dev_err() calls and fix blank lines in fsl_sata_rx_watermark_store(). Reported-by: kernel test robot Signed-off-by: Damien Le Moal --- drivers/ata/sata_fsl.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index da0152116d9f..556034a15430 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -322,7 +322,7 @@ static void fsl_sata_set_irq_coalescing(struct ata_host *host, static ssize_t fsl_sata_intr_coalescing_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d %d\n", + return sysfs_emit(buf, "%u %u\n", intr_coalescing_count, intr_coalescing_ticks); } @@ -332,10 +332,8 @@ static ssize_t fsl_sata_intr_coalescing_store(struct device *dev, { unsigned int coalescing_count, coalescing_ticks; - if (sscanf(buf, "%d%d", - &coalescing_count, - &coalescing_ticks) != 2) { - printk(KERN_ERR "fsl-sata: wrong parameter format.\n"); + if (sscanf(buf, "%u%u", &coalescing_count, &coalescing_ticks) != 2) { + dev_err(dev, "fsl-sata: wrong parameter format.\n"); return -EINVAL; } @@ -359,7 +357,7 @@ static ssize_t fsl_sata_rx_watermark_show(struct device *dev, rx_watermark &= 0x1f; spin_unlock_irqrestore(&host->lock, flags); - return sysfs_emit(buf, "%d\n", rx_watermark); + return sysfs_emit(buf, "%u\n", rx_watermark); } static ssize_t fsl_sata_rx_watermark_store(struct device *dev, @@ -373,8 +371,8 @@ static ssize_t fsl_sata_rx_watermark_store(struct device *dev, void __iomem *csr_base = host_priv->csr_base; u32 temp; - if (sscanf(buf, "%d", &rx_watermark) != 1) { - printk(KERN_ERR "fsl-sata: wrong parameter format.\n"); + if (kstrtouint(buf, 10, &rx_watermark) < 0) { + dev_err(dev, "fsl-sata: wrong parameter format.\n"); return -EINVAL; } @@ -382,8 +380,8 @@ static ssize_t fsl_sata_rx_watermark_store(struct device *dev, temp = ioread32(csr_base + TRANSCFG); temp &= 0xffffffe0; iowrite32(temp | rx_watermark, csr_base + TRANSCFG); - spin_unlock_irqrestore(&host->lock, flags); + return strlen(buf); } From c8ea23d5fa59f28302d4e3370c75d9c308e64410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20B=C3=B6sz=C3=B6rm=C3=A9nyi?= Date: Fri, 4 Feb 2022 13:57:50 +0100 Subject: [PATCH 1155/1295] ata: libata-core: Disable TRIM on M88V29 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This device is a CF card, or possibly an SSD in CF form factor. It supports NCQ and high speed DMA. While it also advertises TRIM support, I/O errors are reported when the discard mount option fstrim is used. TRIM also fails when disabling NCQ and not just as an NCQ command. TRIM must be disabled for this device. Signed-off-by: Zoltán Böszörményi Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ba9273f80069..0c854aebfe0b 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4029,6 +4029,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* devices that don't properly handle TRIM commands */ { "SuperSSpeed S238*", NULL, ATA_HORKAGE_NOTRIM, }, + { "M88V29*", NULL, ATA_HORKAGE_NOTRIM, }, /* * As defined, the DRAT (Deterministic Read After Trim) and RZAT From 088400521e421a1df7d0128dc0f9246db4ef1c7c Mon Sep 17 00:00:00 2001 From: Chia-Wei Wang Date: Tue, 1 Feb 2022 17:30:27 +1030 Subject: [PATCH 1156/1295] docs/ABI: testing: aspeed-uart-routing: Escape asterisk Escape asterisk symbols to fix the following warning: "WARNING: Inline emphasis start-string without end-string" Fixes: c6807970c3bc ("soc: aspeed: Add UART routing support") Reported-by: Stephen Rothwell Signed-off-by: Chia-Wei Wang Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20220124014351.9121-1-chiawei_wang@aspeedtech.com Link: https://lore.kernel.org/r/20220201070027.196314-1-joel@jms.id.au' Signed-off-by: Arnd Bergmann --- Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing b/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing index b363827da437..910df0e5815a 100644 --- a/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing +++ b/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing @@ -1,4 +1,4 @@ -What: /sys/bus/platform/drivers/aspeed-uart-routing/*/uart* +What: /sys/bus/platform/drivers/aspeed-uart-routing/\*/uart\* Date: September 2021 Contact: Oskar Senft Chia-Wei Wang @@ -9,7 +9,7 @@ Description: Selects the RX source of the UARTx device. depends on the selected file. e.g. - cat /sys/bus/platform/drivers/aspeed-uart-routing/*.uart_routing/uart1 + cat /sys/bus/platform/drivers/aspeed-uart-routing/\*.uart_routing/uart1 [io1] io2 io3 io4 uart2 uart3 uart4 io6 In this case, UART1 gets its input from IO1 (physical serial port 1). @@ -17,7 +17,7 @@ Description: Selects the RX source of the UARTx device. Users: OpenBMC. Proposed changes should be mailed to openbmc@lists.ozlabs.org -What: /sys/bus/platform/drivers/aspeed-uart-routing/*/io* +What: /sys/bus/platform/drivers/aspeed-uart-routing/\*/io\* Date: September 2021 Contact: Oskar Senft Chia-Wei Wang From 301a5d3ad2432d7829f59432ca0a93a6defbb9a1 Mon Sep 17 00:00:00 2001 From: Jae Hyun Yoo Date: Tue, 1 Feb 2022 17:31:18 +1030 Subject: [PATCH 1157/1295] soc: aspeed: lpc-ctrl: Block error printing on probe defer cases Add a checking code when it gets -EPROBE_DEFER while getting a clock resource. In this case, it doesn't need to print out an error message because the probing will be re-visited. Signed-off-by: Jae Hyun Yoo Signed-off-by: Joel Stanley Reviewed-by: Andrew Jeffery Reviewed-by: Iwona Winiarska Link: https://lore.kernel.org/r/20211104173709.222912-1-jae.hyun.yoo@intel.com Link: https://lore.kernel.org/r/20220201070118.196372-1-joel@jms.id.au' Signed-off-by: Arnd Bergmann --- drivers/soc/aspeed/aspeed-lpc-ctrl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/soc/aspeed/aspeed-lpc-ctrl.c b/drivers/soc/aspeed/aspeed-lpc-ctrl.c index 72771e018c42..258894ed234b 100644 --- a/drivers/soc/aspeed/aspeed-lpc-ctrl.c +++ b/drivers/soc/aspeed/aspeed-lpc-ctrl.c @@ -306,10 +306,9 @@ static int aspeed_lpc_ctrl_probe(struct platform_device *pdev) } lpc_ctrl->clk = devm_clk_get(dev, NULL); - if (IS_ERR(lpc_ctrl->clk)) { - dev_err(dev, "couldn't get clock\n"); - return PTR_ERR(lpc_ctrl->clk); - } + if (IS_ERR(lpc_ctrl->clk)) + return dev_err_probe(dev, PTR_ERR(lpc_ctrl->clk), + "couldn't get clock\n"); rc = clk_prepare_enable(lpc_ctrl->clk); if (rc) { dev_err(dev, "couldn't enable clock\n"); From 724004a11a84ea762b03bc1822c40d977ae53f1c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 3 Feb 2022 18:41:17 -0600 Subject: [PATCH 1158/1295] ARM: dts: spear320: Drop unused and undocumented 'irq-over-gpio' property The property 'irq-over-gpio' is both unused and undocumented. It also happens to collide with standard *-gpio properties. As it is not needed, drop it. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220204004117.1232902-1-robh@kernel.org' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/spear320-hmi.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/boot/dts/spear320-hmi.dts b/arch/arm/boot/dts/spear320-hmi.dts index 367ba48aac3e..b587e4ec11e5 100644 --- a/arch/arm/boot/dts/spear320-hmi.dts +++ b/arch/arm/boot/dts/spear320-hmi.dts @@ -235,7 +235,6 @@ #address-cells = <1>; #size-cells = <0>; reg = <0x41>; - irq-over-gpio; irq-gpios = <&gpiopinctrl 29 0x4>; id = <0>; blocks = <0x5>; From d9058d6a0e92d8e4a00855f8fe204792f42794db Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 6 Feb 2022 00:53:12 +0100 Subject: [PATCH 1159/1295] ARM: dts: Fix boot regression on Skomer The signal routing on the Skomer board was incorrect making it impossible to mount root from the SD card. Fix this up. Signed-off-by: Linus Walleij Cc: stable@vger.kernel.org Cc: Stefan Hansson Link: https://lore.kernel.org/r/20220205235312.446730-1-linus.walleij@linaro.org' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/ste-ux500-samsung-skomer.dts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts b/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts index 580ca497f312..f8c5899fbdba 100644 --- a/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts +++ b/arch/arm/boot/dts/ste-ux500-samsung-skomer.dts @@ -185,10 +185,6 @@ cap-sd-highspeed; cap-mmc-highspeed; /* All direction control is used */ - st,sig-dir-cmd; - st,sig-dir-dat0; - st,sig-dir-dat2; - st,sig-dir-dat31; st,sig-pin-fbclk; full-pwr-cycle; vmmc-supply = <&ab8500_ldo_aux3_reg>; From d9bc0de02aa0afa7ff96682428b2bb792bf00d9c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Feb 2022 18:55:02 +0100 Subject: [PATCH 1160/1295] MAINTAINERS: arm: samsung: add Git tree and IRC Add already used Krzysztof Kozlowski's Git tree for Samsung S3C/S5P/Exynos ARM sub-architecture and IRC channel (#linux-exynos at Libera). This documents purely existing state. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220207175503.425200-1-krzysztof.kozlowski@canonical.com' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6dbbc16a807a..9a03b8d00f2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2575,7 +2575,9 @@ R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org S: Maintained +C: irc://irc.libera.chat/linux-exynos Q: https://patchwork.kernel.org/project/linux-samsung-soc/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git F: Documentation/arm/samsung/ F: Documentation/devicetree/bindings/arm/samsung/ F: Documentation/devicetree/bindings/power/pd-samsung.yaml @@ -15296,6 +15298,7 @@ R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org S: Maintained +C: irc://irc.libera.chat/linux-exynos Q: https://patchwork.kernel.org/project/linux-samsung-soc/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git F: Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt From 5b52ada7141f7adb53bed6d104df2690f4304f4c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Feb 2022 18:55:03 +0100 Subject: [PATCH 1161/1295] MAINTAINERS: add IRC to ARM sub-architectures and Devicetree Mention the IRC channels used for discussions about ARM/ARM64 sub-architectures and Devicetree. This documents purely existing state. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220207175503.425200-2-krzysztof.kozlowski@canonical.com' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9a03b8d00f2c..8bcc2989313a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1620,6 +1620,7 @@ M: Olof Johansson M: soc@kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained +C: irc://irc.libera.chat/armlinux T: git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git F: arch/arm/boot/dts/Makefile F: arch/arm64/boot/dts/Makefile @@ -1627,6 +1628,7 @@ F: arch/arm64/boot/dts/Makefile ARM SUB-ARCHITECTURES L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained +C: irc://irc.libera.chat/armlinux T: git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git F: arch/arm/mach-*/ F: arch/arm/plat-*/ @@ -14393,6 +14395,7 @@ M: Rob Herring M: Frank Rowand L: devicetree@vger.kernel.org S: Maintained +C: irc://irc.libera.chat/devicetree W: http://www.devicetree.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git F: Documentation/ABI/testing/sysfs-firmware-ofw @@ -14404,6 +14407,7 @@ OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS M: Rob Herring L: devicetree@vger.kernel.org S: Maintained +C: irc://irc.libera.chat/devicetree Q: http://patchwork.ozlabs.org/project/devicetree-bindings/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git F: Documentation/devicetree/ From 4a64f2d3527a0ae400bcea353898a8f47209b446 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 7 Feb 2022 19:46:52 +0100 Subject: [PATCH 1162/1295] MAINTAINERS: add myself as a maintainer for the sl28cpld The sl28cpld is a management controller found on the Kontron SMARC-sAL28 board for now. Support for it was added by me quite a while ago, but I didn't add a MAINTAINERS entry. Add it now. Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20220207184652.1218447-1-michael@walle.cc' Signed-off-by: Arnd Bergmann --- MAINTAINERS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8bcc2989313a..f3b825c779a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17736,6 +17736,21 @@ S: Maintained W: http://www.winischhofer.at/linuxsisusbvga.shtml F: drivers/usb/misc/sisusbvga/ +SL28 CPLD MFD DRIVER +M: Michael Walle +S: Maintained +F: Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml +F: Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml +F: Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml +F: Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml +F: Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml +F: Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml +F: drivers/gpio/gpio-sl28cpld.c +F: drivers/hwmon/sl28cpld-hwmon.c +F: drivers/irqchip/irq-sl28cpld.c +F: drivers/pwm/pwm-sl28cpld.c +F: drivers/watchdog/sl28cpld_wdt.c + SLAB ALLOCATOR M: Christoph Lameter M: Pekka Enberg From 95a4eed7dd5b7c1c3664a626174290686ddbee9f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 1 Feb 2022 17:27:55 +0200 Subject: [PATCH 1163/1295] gpiolib: Never return internal error codes to user space Currently it's possible that character device interface may return the error codes which are not supposed to be seen by user space. In this case it's EPROBE_DEFER. Wrap it to return -ENODEV instead as sysfs does. Fixes: d7c51b47ac11 ("gpio: userspace ABI for reading/writing GPIO lines") Fixes: 61f922db7221 ("gpio: userspace ABI for reading GPIO line events") Fixes: 3c0d9c635ae2 ("gpiolib: cdev: support GPIO_V2_GET_LINE_IOCTL and GPIO_V2_LINE_GET_VALUES_IOCTL") Reported-by: Suresh Balakrishnan Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 6 +++--- drivers/gpio/gpiolib-sysfs.c | 7 ++----- drivers/gpio/gpiolib.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index c7b5446d01fd..ffa0256cad5a 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -330,7 +330,7 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) goto out_free_lh; } - ret = gpiod_request(desc, lh->label); + ret = gpiod_request_user(desc, lh->label); if (ret) goto out_free_lh; lh->descs[i] = desc; @@ -1378,7 +1378,7 @@ static int linereq_create(struct gpio_device *gdev, void __user *ip) goto out_free_linereq; } - ret = gpiod_request(desc, lr->label); + ret = gpiod_request_user(desc, lr->label); if (ret) goto out_free_linereq; @@ -1764,7 +1764,7 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) } } - ret = gpiod_request(desc, le->label); + ret = gpiod_request_user(desc, le->label); if (ret) goto out_free_le; le->desc = desc; diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c index 4098bc7f88b7..44c1ad51b3fe 100644 --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c @@ -475,12 +475,9 @@ static ssize_t export_store(struct class *class, * they may be undone on its behalf too. */ - status = gpiod_request(desc, "sysfs"); - if (status) { - if (status == -EPROBE_DEFER) - status = -ENODEV; + status = gpiod_request_user(desc, "sysfs"); + if (status) goto done; - } status = gpiod_set_transitory(desc, false); if (!status) { diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 30bc3f80f83e..c31f4626915d 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -135,6 +135,18 @@ struct gpio_desc { int gpiod_request(struct gpio_desc *desc, const char *label); void gpiod_free(struct gpio_desc *desc); + +static inline int gpiod_request_user(struct gpio_desc *desc, const char *label) +{ + int ret; + + ret = gpiod_request(desc, label); + if (ret == -EPROBE_DEFER) + ret = -ENODEV; + + return ret; +} + int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, unsigned long lflags, enum gpiod_flags dflags); int gpio_set_debounce_timeout(struct gpio_desc *desc, unsigned int debounce); From cc38ef936840ac29204d806deb4d1836ec509594 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 4 Feb 2022 13:02:25 +0000 Subject: [PATCH 1164/1295] gpio: sifive: use the correct register to read output values Setting the output of a GPIO to 1 using gpiod_set_value(), followed by reading the same GPIO using gpiod_get_value(), will currently yield an incorrect result. This is because the SiFive GPIO device stores the output values in reg_set, not reg_dat. Supply the flag BGPIOF_READ_OUTPUT_REG_SET to bgpio_init() so that the generic driver reads the correct register. Fixes: 96868dce644d ("gpio/sifive: Add GPIO driver for SiFive SoCs") Signed-off-by: Niklas Cassel Reviewed-by: Linus Walleij [Bartosz: added the Fixes tag] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sifive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 403f9e833d6a..7d82388b4ab7 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -223,7 +223,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) NULL, chip->base + SIFIVE_GPIO_OUTPUT_EN, chip->base + SIFIVE_GPIO_INPUT_EN, - 0); + BGPIOF_READ_OUTPUT_REG_SET); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; From bca828ccdd6548d24613d0cede04ada4dfb2f89c Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sun, 6 Feb 2022 02:56:26 +0100 Subject: [PATCH 1165/1295] speakup-dectlk: Restore pitch setting d97a9d7aea04 ("staging/speakup: Add inflection synth parameter") introduced the inflection parameter, but happened to drop the pitch parameter from the dectlk driver. This restores it. Cc: stable@vger.kernel.org Fixes: d97a9d7aea04 ("staging/speakup: Add inflection synth parameter") Signed-off-by: Samuel Thibault Link: https://lore.kernel.org/r/20220206015626.aesbhvvdkmqsrbaw@begin Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/speakup_dectlk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accessibility/speakup/speakup_dectlk.c b/drivers/accessibility/speakup/speakup_dectlk.c index 580ec796816b..78ca4987e619 100644 --- a/drivers/accessibility/speakup/speakup_dectlk.c +++ b/drivers/accessibility/speakup/speakup_dectlk.c @@ -44,6 +44,7 @@ static struct var_t vars[] = { { CAPS_START, .u.s = {"[:dv ap 160] " } }, { CAPS_STOP, .u.s = {"[:dv ap 100 ] " } }, { RATE, .u.n = {"[:ra %d] ", 180, 75, 650, 0, 0, NULL } }, + { PITCH, .u.n = {"[:dv ap %d] ", 122, 50, 350, 0, 0, NULL } }, { INFLECTION, .u.n = {"[:dv pr %d] ", 100, 0, 10000, 0, 0, NULL } }, { VOL, .u.n = {"[:dv g5 %d] ", 86, 60, 86, 0, 0, NULL } }, { PUNCT, .u.n = {"[:pu %c] ", 0, 0, 2, 0, 0, "nsa" } }, From 117b4e96c7f362eb6459543883fc07f77662472c Mon Sep 17 00:00:00 2001 From: Udipto Goswami Date: Mon, 7 Feb 2022 09:55:58 +0530 Subject: [PATCH 1166/1295] usb: dwc3: gadget: Prevent core from processing stale TRBs With CPU re-ordering on write instructions, there might be a chance that the HWO is set before the TRB is updated with the new mapped buffer address. And in the case where core is processing a list of TRBs it is possible that it fetched the TRBs when the HWO is set but before the buffer address is updated. Prevent this by adding a memory barrier before the HWO is updated to ensure that the core always process the updated TRBs. Fixes: f6bafc6a1c9d ("usb: dwc3: convert TRBs into bitshifts") Cc: stable Reviewed-by: Pavankumar Kondeti Signed-off-by: Udipto Goswami Link: https://lore.kernel.org/r/1644207958-18287-1-git-send-email-quic_ugoswami@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 520031ba38aa..183b90923f51 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1291,6 +1291,19 @@ static void __dwc3_prepare_one_trb(struct dwc3_ep *dep, struct dwc3_trb *trb, if (usb_endpoint_xfer_bulk(dep->endpoint.desc) && dep->stream_capable) trb->ctrl |= DWC3_TRB_CTRL_SID_SOFN(stream_id); + /* + * As per data book 4.2.3.2TRB Control Bit Rules section + * + * The controller autonomously checks the HWO field of a TRB to determine if the + * entire TRB is valid. Therefore, software must ensure that the rest of the TRB + * is valid before setting the HWO field to '1'. In most systems, this means that + * software must update the fourth DWORD of a TRB last. + * + * However there is a possibility of CPU re-ordering here which can cause + * controller to observe the HWO bit set prematurely. + * Add a write memory barrier to prevent CPU re-ordering. + */ + wmb(); trb->ctrl |= DWC3_TRB_CTRL_HWO; dwc3_ep_inc_enq(dep); From 4e2a354e3775870ca823f1fb29bbbffbe11059a6 Mon Sep 17 00:00:00 2001 From: Oliver Barta Date: Tue, 8 Feb 2022 09:46:45 +0100 Subject: [PATCH 1167/1295] regulator: core: fix false positive in regulator_late_cleanup() The check done by regulator_late_cleanup() to detect whether a regulator is on was inconsistent with the check done by _regulator_is_enabled(). While _regulator_is_enabled() takes the enable GPIO into account, regulator_late_cleanup() was not doing that. This resulted in a false positive, e.g. when a GPIO-controlled fixed regulator was used, which was not enabled at boot time, e.g. reg_disp_1v2: reg_disp_1v2 { compatible = "regulator-fixed"; regulator-name = "display_1v2"; regulator-min-microvolt = <1200000>; regulator-max-microvolt = <1200000>; gpio = <&tlmm 148 0>; enable-active-high; }; Such regulator doesn't have an is_enabled() operation. Nevertheless it's state can be determined based on the enable GPIO. The check in regulator_late_cleanup() wrongly assumed that the regulator is on and tried to disable it. Signed-off-by: Oliver Barta Link: https://lore.kernel.org/r/20220208084645.8686-1-oliver.barta@aptiv.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 86aa4141efa9..d2553970a67b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -6014,9 +6014,8 @@ core_initcall(regulator_init); static int regulator_late_cleanup(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); - const struct regulator_ops *ops = rdev->desc->ops; struct regulation_constraints *c = rdev->constraints; - int enabled, ret; + int ret; if (c && c->always_on) return 0; @@ -6029,14 +6028,8 @@ static int regulator_late_cleanup(struct device *dev, void *data) if (rdev->use_count) goto unlock; - /* If we can't read the status assume it's always on. */ - if (ops->is_enabled) - enabled = ops->is_enabled(rdev); - else - enabled = 1; - - /* But if reading the status failed, assume that it's off. */ - if (enabled <= 0) + /* If reading the status failed, assume that it's off. */ + if (_regulator_is_enabled(rdev) <= 0) goto unlock; if (have_full_constraints()) { From 50b10528aad568c95f772039d4b3093b4aea7439 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 7 Feb 2022 16:59:31 +0100 Subject: [PATCH 1168/1295] fbcon: Avoid 'cap' set but not used warning Fix this kernel test robot warning: drivers/video/fbdev/core/fbcon.c: In function 'fbcon_init': drivers/video/fbdev/core/fbcon.c:1028:6: warning: variable 'cap' set but not used [-Wunused-but-set-variable] The cap variable is only used when CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION is enabled. Drop the temporary variable and use info->flags instead. Fixes: 87ab9f6b7417 ("Revert "fbcon: Disable accelerated scrolling") Reported-by: kernel test robot Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/YgFB4xqI+As196FR@p100 --- drivers/video/fbdev/core/fbcon.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f36829eeb5a9..2fc1b80a26ad 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1025,7 +1025,7 @@ static void fbcon_init(struct vc_data *vc, int init) struct vc_data *svc = *default_mode; struct fbcon_display *t, *p = &fb_display[vc->vc_num]; int logo = 1, new_rows, new_cols, rows, cols; - int cap, ret; + int ret; if (WARN_ON(info_idx == -1)) return; @@ -1034,7 +1034,6 @@ static void fbcon_init(struct vc_data *vc, int init) con2fb_map[vc->vc_num] = info_idx; info = registered_fb[con2fb_map[vc->vc_num]]; - cap = info->flags; if (logo_shown < 0 && console_loglevel <= CONSOLE_LOGLEVEL_QUIET) logo_shown = FBCON_LOGO_DONTSHOW; @@ -1137,8 +1136,8 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION - if ((cap & FBINFO_HWACCEL_COPYAREA) && - !(cap & FBINFO_HWACCEL_DISABLED)) + if ((info->flags & FBINFO_HWACCEL_COPYAREA) && + !(info->flags & FBINFO_HWACCEL_DISABLED)) p->scrollmode = SCROLL_MOVE; else /* default to something safe */ p->scrollmode = SCROLL_REDRAW; From db405774f6a80c5607dcf43ec810f078bb5c660d Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 31 Jan 2022 22:05:32 +0100 Subject: [PATCH 1169/1295] MAINTAINERS: Add entry for fbdev core Ever since Tomi extracted the core code in 2014 it's been defacto me maintaining this, with help from others from dri-devel and sometimes Linus (but those are mostly merge conflicts): $ git shortlog -ns drivers/video/fbdev/core/ | head -n5 35 Daniel Vetter 23 Linus Torvalds 10 Hans de Goede 9 Dave Airlie 6 Peter Rosin I think ideally we'd also record that the various firmware fb drivers (efifb, vesafb, ...) are also maintained in drm-misc because for the past few years the patches have either been to fix handover issues with drm drivers, or caused handover issues with drm drivers. So any other tree just doesn't make sense. But also, there's plenty of outdated MAINTAINER entries for these with people and git trees that haven't been active in years, so maybe let's just leave them alone. And furthermore distros are now adopting simpledrm as the firmware fb driver, so hopefully the need to care about the fbdev firmware drivers will go down going forward. Note that drm-misc is group maintained, I expect that to continue like we've done before, so no new expectations that patches all go through my hands. That would be silly. This also means I'm happy to put any other volunteer's name in the M: line, but otherwise git log says I'm the one who's stuck with this. Acked-by: Alex Deucher Acked-by: Daniel Stone Acked-by: Dave Airlie Acked-by: Geert Uytterhoeven Acked-by: Greg Kroah-Hartman Cc: Dave Airlie Acked-by: Helge Deller Acked-by: Jani Nikula Acked-by: Maxime Ripard Acked-by: Sam Ravnborg Acked-by: Thomas Zimmermann Acked-by: Tomi Valkeinen Reviewed-by: Javier Martinez Canillas Cc: Jani Nikula Cc: Linus Torvalds Cc: Linux Fbdev development list Cc: Pavel Machek Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: Javier Martinez Canillas Cc: DRI Development Cc: Linux Kernel Mailing List Cc: Claudio Suarez Cc: Tomi Valkeinen Cc: Geert Uytterhoeven Cc: Thomas Zimmermann Cc: Daniel Vetter Cc: Sven Schnelle Cc: Gerd Hoffmann Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220131210552.482606-2-daniel.vetter@ffwll.ch --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..49809eaa3096 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7573,6 +7573,12 @@ S: Maintained W: http://floatingpoint.sourceforge.net/emulator/index.html F: arch/x86/math-emu/ +FRAMEBUFFER CORE +M: Daniel Vetter +F: drivers/video/fbdev/core/ +S: Odd Fixes +T: git git://anongit.freedesktop.org/drm/drm-misc + FRAMEBUFFER LAYER M: Helge Deller L: linux-fbdev@vger.kernel.org From 468d126dab45718feeb728319be20bd869a5eaa7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Feb 2022 18:52:01 -0500 Subject: [PATCH 1170/1295] NFS: Fix initialisation of nfs_client cl_flags field For some long forgotten reason, the nfs_client cl_flags field is initialised in nfs_get_client() instead of being initialised at allocation time. This quirk was harmless until we moved the call to nfs_create_rpc_client(). Fixes: dd99e9f98fbf ("NFSv4: Initialise connection to the server in nfs4_alloc_client()") Cc: stable@vger.kernel.org # 4.8.x Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f18e80fda9bf..d1f34229e11a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -177,6 +177,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; @@ -423,7 +424,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } From a9c10b5b3b67b3750a10c8b089b2e05f5e176e33 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Tue, 25 Jan 2022 22:06:46 +0200 Subject: [PATCH 1171/1295] xprtrdma: fix pointer derefs in error cases of rpcrdma_ep_create If there are failures then we must not leave the non-NULL pointers with the error value, otherwise `rpcrdma_ep_destroy` gets confused and tries free them, resulting in an Oops. Signed-off-by: Dan Aloni Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f172d1298013..7b5fce2faa10 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -413,6 +413,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.send_cq)) { rc = PTR_ERR(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; goto out_destroy; } @@ -421,6 +422,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.recv_cq)) { rc = PTR_ERR(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; goto out_destroy; } ep->re_receive_count = 0; @@ -459,6 +461,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_pd = ib_alloc_pd(device, 0); if (IS_ERR(ep->re_pd)) { rc = PTR_ERR(ep->re_pd); + ep->re_pd = NULL; goto out_destroy; } From 3d4a39404ba323b08fb42bcdca9a015144e213dd Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 13 Jan 2022 10:26:04 +0800 Subject: [PATCH 1172/1295] NFS: Fix nfs4_proc_get_locations() kernel-doc comment Add the description of @server and @fhandle, and remove the excess @inode in nfs4_proc_get_locations() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/nfs/nfs4proc.c:8219: warning: Function parameter or member 'server' not described in 'nfs4_proc_get_locations' fs/nfs/nfs4proc.c:8219: warning: Function parameter or member 'fhandle' not described in 'nfs4_proc_get_locations' fs/nfs/nfs4proc.c:8219: warning: Excess function parameter 'inode' description in 'nfs4_proc_get_locations' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b18f31b2c9e7..f5020828ab65 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8032,7 +8032,8 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, /** * nfs4_proc_get_locations - discover locations for a migrated FSID - * @inode: inode on FSID that is migrating + * @server: pointer to nfs_server to process + * @fhandle: pointer to the kernel NFS client file handle * @locations: result of query * @page: buffer * @cred: credential to use for this operation From 63db37e99ac17f575766c9317f927c8d4c6e8cdf Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 7 Feb 2022 11:14:47 -0500 Subject: [PATCH 1173/1295] MAINTAINERS: Update my email address Signed-off-by: Anna Schumaker --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae..afff78665e0f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13566,7 +13566,7 @@ F: tools/testing/selftests/nci/ NFS, SUNRPC, AND LOCKD CLIENTS M: Trond Myklebust -M: Anna Schumaker +M: Anna Schumaker L: linux-nfs@vger.kernel.org S: Maintained W: http://client.linux-nfs.org From b49ea673e119f59c71645e2f65b3ccad857c90ee Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 17 Jan 2022 16:36:53 +1100 Subject: [PATCH 1174/1295] SUNRPC: lock against ->sock changing during sysfs read ->sock can be set to NULL asynchronously unless ->recv_mutex is held. So it is important to hold that mutex. Otherwise a sysfs read can trigger an oops. Commit 17f09d3f619a ("SUNRPC: Check if the xprt is connected before handling sysfs reads") appears to attempt to fix this problem, but it only narrows the race window. Fixes: 17f09d3f619a ("SUNRPC: Check if the xprt is connected before handling sysfs reads") Fixes: a8482488a7d6 ("SUNRPC query transport's source port") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 5 ++++- net/sunrpc/xprtsock.c | 7 ++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b64a0286b182..05c758da6a92 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -115,11 +115,14 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, } sock = container_of(xprt, struct sock_xprt, xprt); - if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) + mutex_lock(&sock->recv_mutex); + if (sock->sock == NULL || + kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) goto out; ret = sprintf(buf, "%pISc\n", &saddr); out: + mutex_unlock(&sock->recv_mutex); xprt_put(xprt); return ret + 1; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 69b6ee5a5fd1..0f39e08ee580 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1641,7 +1641,12 @@ static int xs_get_srcport(struct sock_xprt *transport) unsigned short get_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); - return xs_sock_getport(sock->sock); + unsigned short ret = 0; + mutex_lock(&sock->recv_mutex); + if (sock->sock) + ret = xs_sock_getport(sock->sock); + mutex_unlock(&sock->recv_mutex); + return ret; } EXPORT_SYMBOL(get_srcport); From 1830947ee4e8ed3e7083e8d41d2b8486568ebea7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Jan 2022 15:07:37 +0100 Subject: [PATCH 1175/1295] arm64: Remove ARCH_VULCAN Commit a314520d82317650 ("arm64: disable Broadcom Vulcan platform") did not remove the ARCH_VULCAN configuration symbol, as there were still references to this symbol. As of commits 240d3d5b2a7a3263 ("gpio: xlp: update GPIO_XLP dependency") and f85a543e5373eeba ("arm64: defconfig: drop ARCH_VULCAN"), the last users of ARCH_VULCAN have been removed. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/9e8fef2cf4f2d5648e87076bc96601cff945ce40.1641996361.git.geert+renesas@glider.be' Signed-off-by: Arnd Bergmann --- arch/arm64/Kconfig.platforms | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 7d5d58800170..21697449d762 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -309,9 +309,6 @@ config ARCH_VISCONTI help This enables support for Toshiba Visconti SoCs Family. -config ARCH_VULCAN - def_bool n - config ARCH_XGENE bool "AppliedMicro X-Gene SOC Family" help From 4bbf59a9db44c78dd3e5c72057548f7c1eb8d2ba Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 8 Feb 2022 15:36:21 +0100 Subject: [PATCH 1176/1295] rtla: Fix segmentation fault when failing to enable -t rtla osnoise and timerlat are causing a segmentation fault when running with the --trace option on a kernel that does not support multiple instances. For example: [root@f34 rtla]# rtla osnoise top -t failed to enable the tracer osnoise Could not enable osnoiser tracer for tracing Failed to enable the trace instance Segmentation fault (core dumped) This error happens because the exit code of the tools is trying to destroy the trace instance that failed to be created. Make osnoise_destroy_tool() aware of possible NULL osnoise_tool *, and do not attempt to destroy it. This also simplifies the exit code. Link: https://lkml.kernel.org/r/5660a2b6bf66c2655842360f2d7f6b48db5dba23.1644327249.git.bristot@kernel.org Suggested-by: Steven Rostedt Fixes: 1eceb2fc2ca5 ("rtla/osnoise: Add osnoise top mode") Fixes: 829a6c0b5698 ("rtla/osnoise: Add the hist mode") Fixes: a828cd18bc4a ("rtla: Add timerlat tool and timelart top mode") Fixes: 1eeb6328e8b3 ("rtla/timerlat: Add timerlat hist mode") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/osnoise.c | 3 +++ tools/tracing/rtla/src/osnoise_hist.c | 7 +++---- tools/tracing/rtla/src/osnoise_top.c | 7 +++---- tools/tracing/rtla/src/timerlat_hist.c | 7 +++---- tools/tracing/rtla/src/timerlat_top.c | 7 +++---- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index 7b73d1eccd0e..5648f9252e58 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -750,6 +750,9 @@ void osnoise_put_context(struct osnoise_context *context) */ void osnoise_destroy_tool(struct osnoise_tool *top) { + if (!top) + return; + trace_instance_destroy(&top->trace); if (top->context) diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 180fcbe423cd..1f0b7fce55cf 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -701,9 +701,9 @@ osnoise_hist_set_signals(struct osnoise_hist_params *params) int osnoise_hist_main(int argc, char *argv[]) { struct osnoise_hist_params *params; + struct osnoise_tool *record = NULL; + struct osnoise_tool *tool = NULL; struct trace_instance *trace; - struct osnoise_tool *record; - struct osnoise_tool *tool; int return_value = 1; int retval; @@ -792,9 +792,8 @@ int osnoise_hist_main(int argc, char *argv[]) out_hist: osnoise_free_histogram(tool->data); out_destroy: + osnoise_destroy_tool(record); osnoise_destroy_tool(tool); - if (params->trace_output) - osnoise_destroy_tool(record); free(params); out_exit: exit(return_value); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 332b2ac205fc..c67dc28ef716 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -483,9 +483,9 @@ static void osnoise_top_set_signals(struct osnoise_top_params *params) int osnoise_top_main(int argc, char **argv) { struct osnoise_top_params *params; + struct osnoise_tool *record = NULL; + struct osnoise_tool *tool = NULL; struct trace_instance *trace; - struct osnoise_tool *record; - struct osnoise_tool *tool; int return_value = 1; int retval; @@ -571,9 +571,8 @@ int osnoise_top_main(int argc, char **argv) out_top: osnoise_free_top(tool->data); + osnoise_destroy_tool(record); osnoise_destroy_tool(tool); - if (params->trace_output) - osnoise_destroy_tool(record); out_exit: exit(return_value); } diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 235f9620ef3d..436a799f9adf 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -729,9 +729,9 @@ timerlat_hist_set_signals(struct timerlat_hist_params *params) int timerlat_hist_main(int argc, char *argv[]) { struct timerlat_hist_params *params; + struct osnoise_tool *record = NULL; + struct osnoise_tool *tool = NULL; struct trace_instance *trace; - struct osnoise_tool *record; - struct osnoise_tool *tool; int return_value = 1; int retval; @@ -813,9 +813,8 @@ int timerlat_hist_main(int argc, char *argv[]) out_hist: timerlat_free_histogram(tool->data); + osnoise_destroy_tool(record); osnoise_destroy_tool(tool); - if (params->trace_output) - osnoise_destroy_tool(record); free(params); out_exit: exit(return_value); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 1ebd5291539c..d4187f6534ed 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -521,9 +521,9 @@ timerlat_top_set_signals(struct timerlat_top_params *params) int timerlat_top_main(int argc, char *argv[]) { struct timerlat_top_params *params; + struct osnoise_tool *record = NULL; + struct osnoise_tool *top = NULL; struct trace_instance *trace; - struct osnoise_tool *record; - struct osnoise_tool *top; int return_value = 1; int retval; @@ -609,9 +609,8 @@ int timerlat_top_main(int argc, char *argv[]) out_top: timerlat_free_top(top->data); + osnoise_destroy_tool(record); osnoise_destroy_tool(top); - if (params->trace_output) - osnoise_destroy_tool(record); free(params); out_exit: exit(return_value); From c0cfbb122275da1b726481de5a8cffeb24e6322b Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 26 Jan 2022 15:55:24 +0100 Subject: [PATCH 1177/1295] drm/rockchip: dw_hdmi: Do not leave clock enabled in error case The driver returns an error when devm_phy_optional_get() fails leaving the previously enabled clock turned on. Change order and enable the clock only after the phy has been acquired. Signed-off-by: Sascha Hauer Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20220126145549.617165-3-s.hauer@pengutronix.de --- drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c b/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c index 830bdd5e9b7c..8677c8271678 100644 --- a/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c +++ b/drivers/gpu/drm/rockchip/dw_hdmi-rockchip.c @@ -529,13 +529,6 @@ static int dw_hdmi_rockchip_bind(struct device *dev, struct device *master, return ret; } - ret = clk_prepare_enable(hdmi->vpll_clk); - if (ret) { - DRM_DEV_ERROR(hdmi->dev, "Failed to enable HDMI vpll: %d\n", - ret); - return ret; - } - hdmi->phy = devm_phy_optional_get(dev, "hdmi"); if (IS_ERR(hdmi->phy)) { ret = PTR_ERR(hdmi->phy); @@ -544,6 +537,13 @@ static int dw_hdmi_rockchip_bind(struct device *dev, struct device *master, return ret; } + ret = clk_prepare_enable(hdmi->vpll_clk); + if (ret) { + DRM_DEV_ERROR(hdmi->dev, "Failed to enable HDMI vpll: %d\n", + ret); + return ret; + } + drm_encoder_helper_add(encoder, &dw_hdmi_rockchip_encoder_helper_funcs); drm_simple_encoder_init(drm, encoder, DRM_MODE_ENCODER_TMDS); From 9da1e9ab82c92d0e89fe44cad2cd7c2d18d64070 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 19 Jan 2022 16:11:22 -0800 Subject: [PATCH 1178/1295] drm/rockchip: vop: Correct RK3399 VOP register fields Commit 7707f7227f09 ("drm/rockchip: Add support for afbc") switched up the rk3399_vop_big[] register windows, but it did so incorrectly. The biggest problem is in rk3288_win23_data[] vs. rk3368_win23_data[] .format field: RK3288's format: VOP_REG(RK3288_WIN2_CTRL0, 0x7, 1) RK3368's format: VOP_REG(RK3368_WIN2_CTRL0, 0x3, 5) Bits 5:6 (i.e., shift 5, mask 0x3) are correct for RK3399, according to the TRM. There are a few other small differences between the 3288 and 3368 definitions that were swapped in commit 7707f7227f09. I reviewed them to the best of my ability according to the RK3399 TRM and fixed them up. This fixes IOMMU issues (and display errors) when testing with BG24 color formats. Fixes: 7707f7227f09 ("drm/rockchip: Add support for afbc") Cc: Andrzej Pietrasiewicz Cc: Signed-off-by: Brian Norris Tested-by: Andrzej Pietrasiewicz Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20220119161104.1.I1d01436bef35165a8cdfe9308789c0badb5ff46a@changeid --- drivers/gpu/drm/rockchip/rockchip_vop_reg.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_vop_reg.c b/drivers/gpu/drm/rockchip/rockchip_vop_reg.c index 1f7353f0684a..798b542e5916 100644 --- a/drivers/gpu/drm/rockchip/rockchip_vop_reg.c +++ b/drivers/gpu/drm/rockchip/rockchip_vop_reg.c @@ -902,6 +902,7 @@ static const struct vop_win_phy rk3399_win01_data = { .enable = VOP_REG(RK3288_WIN0_CTRL0, 0x1, 0), .format = VOP_REG(RK3288_WIN0_CTRL0, 0x7, 1), .rb_swap = VOP_REG(RK3288_WIN0_CTRL0, 0x1, 12), + .x_mir_en = VOP_REG(RK3288_WIN0_CTRL0, 0x1, 21), .y_mir_en = VOP_REG(RK3288_WIN0_CTRL0, 0x1, 22), .act_info = VOP_REG(RK3288_WIN0_ACT_INFO, 0x1fff1fff, 0), .dsp_info = VOP_REG(RK3288_WIN0_DSP_INFO, 0x0fff0fff, 0), @@ -912,6 +913,7 @@ static const struct vop_win_phy rk3399_win01_data = { .uv_vir = VOP_REG(RK3288_WIN0_VIR, 0x3fff, 16), .src_alpha_ctl = VOP_REG(RK3288_WIN0_SRC_ALPHA_CTRL, 0xff, 0), .dst_alpha_ctl = VOP_REG(RK3288_WIN0_DST_ALPHA_CTRL, 0xff, 0), + .channel = VOP_REG(RK3288_WIN0_CTRL2, 0xff, 0), }; /* @@ -922,11 +924,11 @@ static const struct vop_win_phy rk3399_win01_data = { static const struct vop_win_data rk3399_vop_win_data[] = { { .base = 0x00, .phy = &rk3399_win01_data, .type = DRM_PLANE_TYPE_PRIMARY }, - { .base = 0x40, .phy = &rk3288_win01_data, + { .base = 0x40, .phy = &rk3368_win01_data, .type = DRM_PLANE_TYPE_OVERLAY }, - { .base = 0x00, .phy = &rk3288_win23_data, + { .base = 0x00, .phy = &rk3368_win23_data, .type = DRM_PLANE_TYPE_OVERLAY }, - { .base = 0x50, .phy = &rk3288_win23_data, + { .base = 0x50, .phy = &rk3368_win23_data, .type = DRM_PLANE_TYPE_CURSOR }, }; From a81da65fbae6436e1e2f415532b8aacc3274d840 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 8 Feb 2022 19:28:28 +0000 Subject: [PATCH 1179/1295] cifs: call cifs_reconnect when a connection is marked In cifsd thread, we should continue to call cifs_reconnect whenever server->tcpStatus is marked as CifsNeedReconnect. This was inexplicably removed by one of my recent commits. Fixing that here. Fixes: a05885ce13bd ("cifs: fix the connection state transitions with multichannel") Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index cff6c01feae2..5b4733eb42c7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -639,6 +639,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (server->tcpStatus == CifsNeedReconnect) { spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(server, false); return -ECONNABORTED; } spin_unlock(&cifs_tcp_ses_lock); From 92a68053c3468705e2c7c752c9a3f256304a35a6 Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Mon, 7 Feb 2022 20:20:44 +0900 Subject: [PATCH 1180/1295] Documentation: KUnit: Fix usage bug Fix a bug of kunit documentation. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205773 : Quoting Steve Pfetsch: : : kunit documentation is incorrect: : https://kunit.dev/third_party/stable_kernel/docs/usage.html : struct rectangle *self = container_of(this, struct shape, parent); : : : Shouldn't it be: : struct rectangle *self = container_of(this, struct rectangle, parent); : ? Signed-off-by: Akira Kawata Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 76af931a332c..1c83e7d60a8a 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -242,7 +242,7 @@ example: int rectangle_area(struct shape *this) { - struct rectangle *self = container_of(this, struct shape, parent); + struct rectangle *self = container_of(this, struct rectangle, parent); return self->length * self->width; }; From 3a92e6de780c7afaa826b80058abd952bb2904d8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 8 Feb 2022 15:47:10 +0100 Subject: [PATCH 1181/1295] MAINTAINERS: Add RTLA entry Add an RTLA entry in the MAINTAINERS file with Steven Rostedt and myself as maintainers. Link: https://lkml.kernel.org/r/50d8870522580905a1c7f3e6fb611a700f632af1.1643994005.git.bristot@kernel.org Cc: Joe Perches Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..61d127e3314f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19576,6 +19576,14 @@ F: Documentation/trace/timerlat-tracer.rst F: Documentation/trace/hwlat_detector.rst F: arch/*/kernel/trace.c +Real-time Linux Analysis (RTLA) tools +M: Daniel Bristot de Oliveira +M: Steven Rostedt +L: linux-trace-devel@vger.kernel.org +S: Maintained +F: Documentation/tools/rtla/ +F: tools/tracing/rtla/ + TRADITIONAL CHINESE DOCUMENTATION M: Hu Haowen L: linux-doc-tw-discuss@lists.sourceforge.net From 3203ce39ac0b2a57a84382ec184c7d4a0bede175 Mon Sep 17 00:00:00 2001 From: JaeSang Yoo Date: Wed, 9 Feb 2022 04:54:22 +0900 Subject: [PATCH 1182/1295] tracing: Fix tp_printk option related with tp_printk_stop_on_boot The kernel parameter "tp_printk_stop_on_boot" starts with "tp_printk" which is the same as another kernel parameter "tp_printk". If "tp_printk" setup is called before the "tp_printk_stop_on_boot", it will override the latter and keep it from being set. This is similar to other kernel parameter issues, such as: Commit 745a600cf1a6 ("um: console: Ignore console= option") or init/do_mounts.c:45 (setup function of "ro" kernel param) Fix it by checking for a "_" right after the "tp_printk" and if that exists do not process the parameter. Link: https://lkml.kernel.org/r/20220208195421.969326-1-jsyoo5b@gmail.com Signed-off-by: JaeSang Yoo [ Fixed up change log and added space after if condition ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c860f582b078..7c2578efde26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -252,6 +252,10 @@ __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; From 084cbb2ec3af2d23be9de65fcc9493e21e265859 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 7 Feb 2022 09:59:01 -0800 Subject: [PATCH 1183/1295] gve: Recording rx queue before sending to napi This caused a significant performance degredation when using generic XDP with multiple queues. Fixes: f5cedc84a30d2 ("gve: Add transmit and receive support") Signed-off-by: Tao Liu Link: https://lore.kernel.org/r/20220207175901.2486596-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index 2068199445bd..e4e98aa7745f 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -609,6 +609,7 @@ static bool gve_rx(struct gve_rx_ring *rx, netdev_features_t feat, *packet_size_bytes = skb->len + (skb->protocol ? ETH_HLEN : 0); *work_done = work_cnt; + skb_record_rx_queue(skb, rx->q_num); if (skb_is_nonlinear(skb)) napi_gro_frags(napi); else From f81393a5b252df772b934cde81b7e16273afbd43 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 23 Jan 2022 17:04:17 +0800 Subject: [PATCH 1184/1295] riscv: extable: fix err reg writing in dedicated uaccess handler Mayuresh reported commit 20802d8d477d ("riscv: extable: add a dedicated uaccess handler") breaks the writev02 test case in LTP. This is due to the err reg isn't correctly set with the errno(-EFAULT in writev02 case). First of all, the err and zero regs are reg numbers rather than reg offsets in struct pt_regs; Secondly, regs_set_gpr() should write the regs when offset isn't zero(zero means epc) Fix it by correcting regs_set_gpr() logic and passing the correct reg offset to it. Reported-by: Mayuresh Chitale Fixes: 20802d8d477d ("riscv: extable: add a dedicated uaccess handler") Signed-off-by: Jisheng Zhang Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/extable.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c index 05978f78579f..35484d830fd6 100644 --- a/arch/riscv/mm/extable.c +++ b/arch/riscv/mm/extable.c @@ -33,7 +33,7 @@ static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset, if (unlikely(offset > MAX_REG_OFFSET)) return; - if (!offset) + if (offset) *(unsigned long *)((unsigned long)regs + offset) = val; } @@ -43,8 +43,8 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); int reg_zero = FIELD_GET(EX_DATA_REG_ZERO, ex->data); - regs_set_gpr(regs, reg_err, -EFAULT); - regs_set_gpr(regs, reg_zero, 0); + regs_set_gpr(regs, reg_err * sizeof(unsigned long), -EFAULT); + regs_set_gpr(regs, reg_zero * sizeof(unsigned long), 0); regs->epc = get_ex_fixup(ex); return true; From 23de0d7b6f0e3f9a6283a882594c479949da1120 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Mon, 7 Feb 2022 14:29:01 -0800 Subject: [PATCH 1185/1295] bonding: pair enable_port with slave_arr_updates When 803.2ad mode enables a participating port, it should update the slave-array. I have observed that the member links are participating and are part of the active aggregator while the traffic is egressing via only one member link (in a case where two links are participating). Via kprobes I discovered that slave-arr has only one link added while the other participating link wasn't part of the slave-arr. I couldn't see what caused that situation but the simple code-walk through provided me hints that the enable_port wasn't always associated with the slave-array update. Fixes: ee6377147409 ("bonding: Simplify the xmit function for modes that use xmit_hash") Signed-off-by: Mahesh Bandewar Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20220207222901.1795287-1-maheshb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_3ad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 6006c2e8fa2b..9fd1d6cba3cd 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1021,8 +1021,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if (port->aggregator && port->aggregator->is_active && !__port_is_enabled(port)) { - __enable_port(port); + *update_slave_arr = true; } } break; @@ -1779,6 +1779,7 @@ static void ad_agg_selection_logic(struct aggregator *agg, port = port->next_port_in_aggregator) { __enable_port(port); } + *update_slave_arr = true; } } From 52492ff5c583036306bc422a83e246c971af387a Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 8 Feb 2022 19:15:17 +0000 Subject: [PATCH 1186/1295] cifs: call helper functions for marking channels for reconnect cifs_mark_tcp_ses_conns_for_reconnect helper function is now meant to be used by any of the threads to mark a channel (or all the channels) for reconnect. Replace all such manual changes to tcpStatus to use this helper function, which takes care that the right channels, smb sessions and tcons are marked for reconnect. Also includes one line minor change Reported-by: kernel test robot Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifs_swn.c | 6 +++--- fs/cifs/dfs_cache.c | 2 +- fs/cifs/smb1ops.c | 4 +--- fs/cifs/transport.c | 5 +---- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 463ebe34892b..cdce1609c5c2 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - cifs_reconnect(swnreg->tcon->ses->server, true); + cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - cifs_reconnect(swnreg->tcon->ses->server, true); + cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -498,7 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - cifs_reconnect(tcon->ses->server, false); + cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, false); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index dd9643751671..831f42458bf6 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - cifs_reconnect(tcon->ses->server, true); + cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 8272c91e15ef..b2fb7bd11936 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -228,9 +228,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) spin_unlock(&GlobalMid_Lock); if (reconnect) { - spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_mark_tcp_ses_conns_for_reconnect(server, false); } return mid; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8540f7c13eae..a4c3e027cca2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -430,10 +430,7 @@ unmask: * be taken as the remainder of this one. We need to kill the * socket so the server throws away the partial SMB */ - spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus != CifsExiting) - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_mark_tcp_ses_conns_for_reconnect(server, false); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); } From 2a05137a0575b7d1006bdf4c1beeee9e391e22a0 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 8 Feb 2022 17:10:02 +0000 Subject: [PATCH 1187/1295] cifs: mark sessions for reconnection in helper function Today we have the code to mark connections and sessions (and tcons) for reconnect clubbed with the code to close the socket and abort all mids in the same function. Sometimes, we need to mark connections and sessions outside cifsd thread. So as a part of this change, I'm splitting this function into two different functions and calling them one after the other in cifs_reconnect. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5b4733eb42c7..053cb449eb16 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -175,11 +175,6 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; - struct mid_q_entry *mid, *nmid; - struct list_head retry_list; - - server->maxBuf = 0; - server->max_read = 0; /* * before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they @@ -219,6 +214,16 @@ next_session: spin_unlock(&ses->chan_lock); } spin_unlock(&cifs_tcp_ses_lock); +} + +static void +cifs_abort_connection(struct TCP_Server_Info *server) +{ + struct mid_q_entry *mid, *nmid; + struct list_head retry_list; + + server->maxBuf = 0; + server->max_read = 0; /* do not want to be sending data on a socket we are freeing */ cifs_dbg(FYI, "%s: tearing down socket\n", __func__); @@ -310,6 +315,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + cifs_abort_connection(server); + do { try_to_freeze(); mutex_lock(&server->srv_mutex); @@ -434,6 +441,8 @@ reconnect_dfs_server(struct TCP_Server_Info *server, cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + cifs_abort_connection(server); + do { try_to_freeze(); mutex_lock(&server->srv_mutex); From f53a2ce893b2c7884ef94471f170839170a4eba0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:47 +0200 Subject: [PATCH 1188/1295] net: dsa: mv88e6xxx: don't use devres for mdiobus As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The mv88e6xxx is an MDIO device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the Marvell switch driver on shutdown. systemd-shutdown[1]: Powering off. mv88e6085 0x0000000008b96000:00 sw_gl0: Link is Down fsl-mc dpbp.9: Removing from iommu group 7 fsl-mc dpbp.8: Removing from iommu group 7 ------------[ cut here ]------------ kernel BUG at drivers/net/phy/mdio_bus.c:677! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.16.5-00040-gdc05f73788e5 #15 pc : mdiobus_free+0x44/0x50 lr : devm_mdiobus_free+0x10/0x20 Call trace: mdiobus_free+0x44/0x50 devm_mdiobus_free+0x10/0x20 devres_release_all+0xa0/0x100 __device_release_driver+0x190/0x220 device_release_driver_internal+0xac/0xb0 device_links_unbind_consumers+0xd4/0x100 __device_release_driver+0x4c/0x220 device_release_driver_internal+0xac/0xb0 device_links_unbind_consumers+0xd4/0x100 __device_release_driver+0x94/0x220 device_release_driver+0x28/0x40 bus_remove_device+0x118/0x124 device_del+0x174/0x420 fsl_mc_device_remove+0x24/0x40 __fsl_mc_device_remove+0xc/0x20 device_for_each_child+0x58/0xa0 dprc_remove+0x90/0xb0 fsl_mc_driver_remove+0x20/0x5c __device_release_driver+0x21c/0x220 device_release_driver+0x28/0x40 bus_remove_device+0x118/0x124 device_del+0x174/0x420 fsl_mc_bus_remove+0x80/0x100 fsl_mc_bus_shutdown+0xc/0x1c platform_shutdown+0x20/0x30 device_shutdown+0x154/0x330 kernel_power_off+0x34/0x6c __do_sys_reboot+0x15c/0x250 __arm64_sys_reboot+0x20/0x30 invoke_syscall.constprop.0+0x4c/0xe0 do_el0_svc+0x4c/0x150 el0_svc+0x24/0xb0 el0t_64_sync_handler+0xa8/0xb0 el0t_64_sync+0x178/0x17c So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The Marvell driver already has a good structure for mdiobus removal, so just plug in mdiobus_free and get rid of devres. Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Tested-by: Daniel Klauer Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 58ca684d73f7..659f29582406 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3399,7 +3399,7 @@ static int mv88e6xxx_mdio_register(struct mv88e6xxx_chip *chip, return err; } - bus = devm_mdiobus_alloc_size(chip->dev, sizeof(*mdio_bus)); + bus = mdiobus_alloc_size(sizeof(*mdio_bus)); if (!bus) return -ENOMEM; @@ -3424,14 +3424,14 @@ static int mv88e6xxx_mdio_register(struct mv88e6xxx_chip *chip, if (!external) { err = mv88e6xxx_g2_irq_mdio_setup(chip, bus); if (err) - return err; + goto out; } err = of_mdiobus_register(bus, np); if (err) { dev_err(chip->dev, "Cannot register MDIO bus (%d)\n", err); mv88e6xxx_g2_irq_mdio_free(chip, bus); - return err; + goto out; } if (external) @@ -3440,6 +3440,10 @@ static int mv88e6xxx_mdio_register(struct mv88e6xxx_chip *chip, list_add(&mdio_bus->list, &chip->mdios); return 0; + +out: + mdiobus_free(bus); + return err; } static void mv88e6xxx_mdios_unregister(struct mv88e6xxx_chip *chip) @@ -3455,6 +3459,7 @@ static void mv88e6xxx_mdios_unregister(struct mv88e6xxx_chip *chip) mv88e6xxx_g2_irq_mdio_free(chip, bus); mdiobus_unregister(bus); + mdiobus_free(bus); } } From 50facd86e9fbc4b93fe02e5fe05776047f45dbfb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:48 +0200 Subject: [PATCH 1189/1295] net: dsa: ar9331: register the mdiobus under devres As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The ar9331 is an MDIO device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the ar9331 switch driver on shutdown. So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The ar9331 driver doesn't have a complex code structure for mdiobus removal, so just replace of_mdiobus_register with the devres variant in order to be all-devres and ensure that we don't free a still-registered bus. Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Tested-by: Oleksij Rempel Signed-off-by: Jakub Kicinski --- drivers/net/dsa/qca/ar9331.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c index da0d7e68643a..c39de2a4c1fe 100644 --- a/drivers/net/dsa/qca/ar9331.c +++ b/drivers/net/dsa/qca/ar9331.c @@ -378,7 +378,7 @@ static int ar9331_sw_mbus_init(struct ar9331_sw_priv *priv) if (!mnp) return -ENODEV; - ret = of_mdiobus_register(mbus, mnp); + ret = devm_of_mdiobus_register(dev, mbus, mnp); of_node_put(mnp); if (ret) return ret; @@ -1091,7 +1091,6 @@ static void ar9331_sw_remove(struct mdio_device *mdiodev) } irq_domain_remove(priv->irqdomain); - mdiobus_unregister(priv->mbus); dsa_unregister_switch(&priv->ds); reset_control_assert(priv->sw_reset); From 08f1a20822349004bb9cc1b153ecb516e9f2889d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:49 +0200 Subject: [PATCH 1190/1295] net: dsa: bcm_sf2: don't use devres for mdiobus As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The Starfighter 2 is a platform device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the bcm_sf2 switch driver on shutdown. So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The bcm_sf2 driver has the code structure in place for orderly mdiobus removal, so just replace devm_mdiobus_alloc() with the non-devres variant, and add manual free where necessary, to ensure that we don't let devres free a still-registered bus. Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/bcm_sf2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 33499fcd8848..6afb5db8244c 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -621,7 +621,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) get_device(&priv->master_mii_bus->dev); priv->master_mii_dn = dn; - priv->slave_mii_bus = devm_mdiobus_alloc(ds->dev); + priv->slave_mii_bus = mdiobus_alloc(); if (!priv->slave_mii_bus) { of_node_put(dn); return -ENOMEM; @@ -681,8 +681,10 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) } err = mdiobus_register(priv->slave_mii_bus); - if (err && dn) + if (err && dn) { + mdiobus_free(priv->slave_mii_bus); of_node_put(dn); + } return err; } @@ -690,6 +692,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv) { mdiobus_unregister(priv->slave_mii_bus); + mdiobus_free(priv->slave_mii_bus); of_node_put(priv->master_mii_dn); } From 209bdb7ec6a28c7cdf580a0a98afbc9fc3b98932 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:50 +0200 Subject: [PATCH 1191/1295] net: dsa: felix: don't use devres for mdiobus As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The Felix VSC9959 switch is a PCI device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the felix switch driver on shutdown. So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The felix driver has the code structure in place for orderly mdiobus removal, so just replace devm_mdiobus_alloc_size() with the non-devres variant, and add manual free where necessary, to ensure that we don't let devres free a still-registered bus. Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index bf8d38239e7e..33f0ceae381d 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1061,7 +1061,7 @@ static int vsc9959_mdio_bus_alloc(struct ocelot *ocelot) return PTR_ERR(hw); } - bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv)); + bus = mdiobus_alloc_size(sizeof(*mdio_priv)); if (!bus) return -ENOMEM; @@ -1081,6 +1081,7 @@ static int vsc9959_mdio_bus_alloc(struct ocelot *ocelot) rc = mdiobus_register(bus); if (rc < 0) { dev_err(dev, "failed to register MDIO bus\n"); + mdiobus_free(bus); return rc; } @@ -1132,6 +1133,7 @@ static void vsc9959_mdio_bus_free(struct ocelot *ocelot) lynx_pcs_destroy(phylink_pcs); } mdiobus_unregister(felix->imdio); + mdiobus_free(felix->imdio); } static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, From bd488afc3b39e045ba71aab472233f2a78726e7b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:51 +0200 Subject: [PATCH 1192/1295] net: dsa: seville: register the mdiobus under devres As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The Seville VSC9959 switch is a platform device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the seville switch driver on shutdown. So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The seville driver has a code structure that could accommodate both the mdiobus_unregister and mdiobus_free calls, but it has an external dependency upon mscc_miim_setup() from mdio-mscc-miim.c, which calls devm_mdiobus_alloc_size() on its behalf. So rather than restructuring that, and exporting yet one more symbol mscc_miim_teardown(), let's work with devres and replace of_mdiobus_register with the devres variant. When we use all-devres, we can ensure that devres doesn't free a still-registered bus (it either runs both callbacks, or none). Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/seville_vsc9953.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 8c1c9da61602..f2f1608a476c 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1029,7 +1029,7 @@ static int vsc9953_mdio_bus_alloc(struct ocelot *ocelot) } /* Needed in order to initialize the bus mutex lock */ - rc = of_mdiobus_register(bus, NULL); + rc = devm_of_mdiobus_register(dev, bus, NULL); if (rc < 0) { dev_err(dev, "failed to register MDIO bus\n"); return rc; @@ -1083,7 +1083,8 @@ static void vsc9953_mdio_bus_free(struct ocelot *ocelot) mdio_device_free(mdio_device); lynx_pcs_destroy(phylink_pcs); } - mdiobus_unregister(felix->imdio); + + /* mdiobus_unregister and mdiobus_free handled by devres */ } static const struct felix_info seville_info_vsc9953 = { From 9ffe3d09e32da45bb5a29cf2e80ec8d7534010c5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:52 +0200 Subject: [PATCH 1193/1295] net: dsa: mt7530: fix kernel bug in mdiobus_free() when unbinding Nobody in this driver calls mdiobus_unregister(), which is necessary if mdiobus_register() completes successfully. So if the devres callbacks that free the mdiobus get invoked (this is the case when unbinding the driver), mdiobus_free() will BUG if the mdiobus is still registered, which it is. My speculation is that this is due to the fact that prior to commit ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") from June 2020, _devm_mdiobus_free() used to call mdiobus_unregister(). But at the time that the mt7530 support was introduced in May 2021, the API was already changed. It's therefore likely that the blamed patch was developed on an older tree, and incorrectly adapted to net-next. This makes the Fixes: tag correct. Fix the problem by using the devres variant of mdiobus_register. Fixes: ba751e28d442 ("net: dsa: mt7530: add interrupt support") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b82512e5b33b..ff3c267d0f26 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2074,7 +2074,7 @@ mt7530_setup_mdio(struct mt7530_priv *priv) if (priv->irq) mt7530_setup_mdio_irq(priv); - ret = mdiobus_register(bus); + ret = devm_mdiobus_register(dev, bus); if (ret) { dev_err(dev, "failed to register MDIO bus: %d\n", ret); if (priv->irq) From 0d120dfb5d67edc5bcd1804e167dba2b30809afd Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Feb 2022 18:15:53 +0200 Subject: [PATCH 1194/1295] net: dsa: lantiq_gswip: don't use devres for mdiobus As explained in commits: 74b6d7d13307 ("net: dsa: realtek: register the MDIO bus under devres") 5135e96a3dd2 ("net: dsa: don't allocate the slave_mii_bus using devres") mdiobus_free() will panic when called from devm_mdiobus_free() <- devres_release_all() <- __device_release_driver(), and that mdiobus was not previously unregistered. The GSWIP switch is a platform device, so the initial set of constraints that I thought would cause this (I2C or SPI buses which call ->remove on ->shutdown) do not apply. But there is one more which applies here. If the DSA master itself is on a bus that calls ->remove from ->shutdown (like dpaa2-eth, which is on the fsl-mc bus), there is a device link between the switch and the DSA master, and device_links_unbind_consumers() will unbind the GSWIP switch driver on shutdown. So the same treatment must be applied to all DSA switch drivers, which is: either use devres for both the mdiobus allocation and registration, or don't use devres at all. The gswip driver has the code structure in place for orderly mdiobus removal, so just replace devm_mdiobus_alloc() with the non-devres variant, and add manual free where necessary, to ensure that we don't let devres free a still-registered bus. Fixes: ac3a68d56651 ("net: phy: don't abuse devres in devm_mdiobus_register()") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_gswip.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 46ed953e787e..320ee7fe91a8 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -498,8 +498,9 @@ static int gswip_mdio_rd(struct mii_bus *bus, int addr, int reg) static int gswip_mdio(struct gswip_priv *priv, struct device_node *mdio_np) { struct dsa_switch *ds = priv->ds; + int err; - ds->slave_mii_bus = devm_mdiobus_alloc(priv->dev); + ds->slave_mii_bus = mdiobus_alloc(); if (!ds->slave_mii_bus) return -ENOMEM; @@ -512,7 +513,11 @@ static int gswip_mdio(struct gswip_priv *priv, struct device_node *mdio_np) ds->slave_mii_bus->parent = priv->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; - return of_mdiobus_register(ds->slave_mii_bus, mdio_np); + err = of_mdiobus_register(ds->slave_mii_bus, mdio_np); + if (err) + mdiobus_free(ds->slave_mii_bus); + + return err; } static int gswip_pce_table_entry_read(struct gswip_priv *priv, @@ -2145,8 +2150,10 @@ disable_switch: gswip_mdio_mask(priv, GSWIP_MDIO_GLOB_ENABLE, 0, GSWIP_MDIO_GLOB); dsa_unregister_switch(priv->ds); mdio_bus: - if (mdio_np) + if (mdio_np) { mdiobus_unregister(priv->ds->slave_mii_bus); + mdiobus_free(priv->ds->slave_mii_bus); + } put_mdio_node: of_node_put(mdio_np); for (i = 0; i < priv->num_gphy_fw; i++) @@ -2169,6 +2176,7 @@ static int gswip_remove(struct platform_device *pdev) if (priv->ds->slave_mii_bus) { mdiobus_unregister(priv->ds->slave_mii_bus); + mdiobus_free(priv->ds->slave_mii_bus); of_node_put(priv->ds->slave_mii_bus->dev.of_node); } From 61772b0908c640d0309c40f7d41d062ca4e979fa Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 7 Feb 2022 16:19:18 -0800 Subject: [PATCH 1195/1295] ibmvnic: don't release napi in __ibmvnic_open() If __ibmvnic_open() encounters an error such as when setting link state, it calls release_resources() which frees the napi structures needlessly. Instead, have __ibmvnic_open() only clean up the work it did so far (i.e. disable napi and irqs) and leave the rest to the callers. If caller of __ibmvnic_open() is ibmvnic_open(), it should release the resources immediately. If the caller is do_reset() or do_hard_reset(), they will release the resources on the next reset. This fixes following crash that occurred when running the drmgr command several times to add/remove a vnic interface: [102056] ibmvnic 30000003 env3: Disabling rx_scrq[6] irq [102056] ibmvnic 30000003 env3: Disabling rx_scrq[7] irq [102056] ibmvnic 30000003 env3: Replenished 8 pools Kernel attempted to read user page (10) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000010 Faulting instruction address: 0xc000000000a3c840 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries ... CPU: 9 PID: 102056 Comm: kworker/9:2 Kdump: loaded Not tainted 5.16.0-rc5-autotest-g6441998e2e37 #1 Workqueue: events_long __ibmvnic_reset [ibmvnic] NIP: c000000000a3c840 LR: c0080000029b5378 CTR: c000000000a3c820 REGS: c0000000548e37e0 TRAP: 0300 Not tainted (5.16.0-rc5-autotest-g6441998e2e37) MSR: 8000000000009033 CR: 28248484 XER: 00000004 CFAR: c0080000029bdd24 DAR: 0000000000000010 DSISR: 40000000 IRQMASK: 0 GPR00: c0080000029b55d0 c0000000548e3a80 c0000000028f0200 0000000000000000 ... NIP [c000000000a3c840] napi_enable+0x20/0xc0 LR [c0080000029b5378] __ibmvnic_open+0xf0/0x430 [ibmvnic] Call Trace: [c0000000548e3a80] [0000000000000006] 0x6 (unreliable) [c0000000548e3ab0] [c0080000029b55d0] __ibmvnic_open+0x348/0x430 [ibmvnic] [c0000000548e3b40] [c0080000029bcc28] __ibmvnic_reset+0x500/0xdf0 [ibmvnic] [c0000000548e3c60] [c000000000176228] process_one_work+0x288/0x570 [c0000000548e3d00] [c000000000176588] worker_thread+0x78/0x660 [c0000000548e3da0] [c0000000001822f0] kthread+0x1c0/0x1d0 [c0000000548e3e10] [c00000000000cf64] ret_from_kernel_thread+0x5c/0x64 Instruction dump: 7d2948f8 792307e0 4e800020 60000000 3c4c01eb 384239e0 f821ffd1 39430010 38a0fff6 e92d1100 f9210028 39200000 f9010020 60420000 e9210020 ---[ end trace 5f8033b08fd27706 ]--- Fixes: ed651a10875f ("ibmvnic: Updated reset handling") Reported-by: Abdul Haleem Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Link: https://lore.kernel.org/r/20220208001918.900602-1-sukadev@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index bda7a2a9d211..29617a86b299 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -110,6 +110,7 @@ static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *tx_scrq); static void free_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb); +static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -1424,7 +1425,7 @@ static int __ibmvnic_open(struct net_device *netdev) rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP); if (rc) { ibmvnic_napi_disable(adapter); - release_resources(adapter); + ibmvnic_disable_irqs(adapter); return rc; } @@ -1474,9 +1475,6 @@ static int ibmvnic_open(struct net_device *netdev) rc = init_resources(adapter); if (rc) { netdev_err(netdev, "failed to initialize resources\n"); - release_resources(adapter); - release_rx_pools(adapter); - release_tx_pools(adapter); goto out; } } @@ -1493,6 +1491,13 @@ out: adapter->state = VNIC_OPEN; rc = 0; } + + if (rc) { + release_resources(adapter); + release_rx_pools(adapter); + release_tx_pools(adapter); + } + return rc; } From 2427f03fb42f9dc14c53108f2c9b5563eb37e770 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Tue, 8 Feb 2022 09:33:08 +0800 Subject: [PATCH 1196/1295] net: ethernet: litex: Add the dependency on HAS_IOMEM The LiteX driver uses devm io function API which needs HAS_IOMEM enabled, so add the dependency on HAS_IOMEM. Fixes: ee7da21ac4c3 ("net: Add driver for LiteX's LiteETH network interface") Signed-off-by: Cai Huoqing Link: https://lore.kernel.org/r/20220208013308.6563-1-cai.huoqing@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/litex/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/litex/Kconfig b/drivers/net/ethernet/litex/Kconfig index f99adbf26ab4..04345b929d8e 100644 --- a/drivers/net/ethernet/litex/Kconfig +++ b/drivers/net/ethernet/litex/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_LITEX config LITEX_LITEETH tristate "LiteX Ethernet support" - depends on OF + depends on OF && HAS_IOMEM help If you wish to compile a kernel for hardware with a LiteX LiteEth device then you should answer Y to this. From 5611a00697c8ecc5aad04392bea629e9d6a20463 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 21:34:51 -0800 Subject: [PATCH 1197/1295] ipmr,ip6mr: acquire RTNL before calling ip[6]mr_free_table() on failure path ip[6]mr_free_table() can only be called under RTNL lock. RTNL: assertion failed at net/core/dev.c (10367) WARNING: CPU: 1 PID: 5890 at net/core/dev.c:10367 unregister_netdevice_many+0x1246/0x1850 net/core/dev.c:10367 Modules linked in: CPU: 1 PID: 5890 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller-11627-g422ee58dc0ef #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:unregister_netdevice_many+0x1246/0x1850 net/core/dev.c:10367 Code: 0f 85 9b ee ff ff e8 69 07 4b fa ba 7f 28 00 00 48 c7 c6 00 90 ae 8a 48 c7 c7 40 90 ae 8a c6 05 6d b1 51 06 01 e8 8c 90 d8 01 <0f> 0b e9 70 ee ff ff e8 3e 07 4b fa 4c 89 e7 e8 86 2a 59 fa e9 ee RSP: 0018:ffffc900046ff6e0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888050f51d00 RSI: ffffffff815fa008 RDI: fffff520008dfece RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815f3d6e R11: 0000000000000000 R12: 00000000fffffff4 R13: dffffc0000000000 R14: ffffc900046ff750 R15: ffff88807b7dc000 FS: 00007f4ab736e700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee0b4f8990 CR3: 000000001e7d2000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mroute_clean_tables+0x244/0xb40 net/ipv6/ip6mr.c:1509 ip6mr_free_table net/ipv6/ip6mr.c:389 [inline] ip6mr_rules_init net/ipv6/ip6mr.c:246 [inline] ip6mr_net_init net/ipv6/ip6mr.c:1306 [inline] ip6mr_net_init+0x3f0/0x4e0 net/ipv6/ip6mr.c:1298 ops_init+0xaf/0x470 net/core/net_namespace.c:140 setup_net+0x54f/0xbb0 net/core/net_namespace.c:331 copy_net_ns+0x318/0x760 net/core/net_namespace.c:475 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 copy_namespaces+0x391/0x450 kernel/nsproxy.c:178 copy_process+0x2e0c/0x7300 kernel/fork.c:2167 kernel_clone+0xe7/0xab0 kernel/fork.c:2555 __do_sys_clone+0xc8/0x110 kernel/fork.c:2672 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f4ab89f9059 Code: Unable to access opcode bytes at RIP 0x7f4ab89f902f. RSP: 002b:00007f4ab736e118 EFLAGS: 00000206 ORIG_RAX: 0000000000000038 RAX: ffffffffffffffda RBX: 00007f4ab8b0bf60 RCX: 00007f4ab89f9059 RDX: 0000000020000280 RSI: 0000000020000270 RDI: 0000000040200000 RBP: 00007f4ab8a5308d R08: 0000000020000300 R09: 0000000020000300 R10: 00000000200002c0 R11: 0000000000000206 R12: 0000000000000000 R13: 00007ffc3977cc1f R14: 00007f4ab736e300 R15: 0000000000022000 Fixes: f243e5a7859a ("ipmr,ip6mr: call ip6mr_free_table() on failure path") Signed-off-by: Eric Dumazet Cc: Cong Wang Reported-by: syzbot Link: https://lore.kernel.org/r/20220208053451.2885398-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 2 ++ net/ipv6/ip6mr.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 07274619b9ea..29bbe2b08ae9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -256,7 +256,9 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ipmr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7cf73e60e619..8a2db926b5eb 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -243,7 +243,9 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ip6mr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; From 7db788ad627aabff2b74d4f1a3b68516d0fee0d7 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Tue, 8 Feb 2022 11:14:53 +0100 Subject: [PATCH 1198/1295] nfp: flower: fix ida_idx not being released When looking for a global mac index the extra NFP_TUN_PRE_TUN_IDX_BIT that gets set if nfp_flower_is_supported_bridge is true is not taken into account. Consequently the path that should release the ida_index in cleanup is never triggered, causing messages like: nfp 0000:02:00.0: nfp: Failed to offload MAC on br-ex. nfp 0000:02:00.0: nfp: Failed to offload MAC on br-ex. nfp 0000:02:00.0: nfp: Failed to offload MAC on br-ex. after NFP_MAX_MAC_INDEX number of reconfigs. Ultimately this lead to new tunnel flows not being offloaded. Fix this by unsetting the NFP_TUN_PRE_TUN_IDX_BIT before checking if the port is of type OTHER. Fixes: 2e0bc7f3cb55 ("nfp: flower: encode mac indexes with pre-tunnel rule check") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20220208101453.321949-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/netronome/nfp/flower/tunnel_conf.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index dfb4468fe287..0a326e04e692 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -1011,6 +1011,7 @@ nfp_tunnel_del_shared_mac(struct nfp_app *app, struct net_device *netdev, struct nfp_flower_repr_priv *repr_priv; struct nfp_tun_offloaded_mac *entry; struct nfp_repr *repr; + u16 nfp_mac_idx; int ida_idx; entry = nfp_tunnel_lookup_offloaded_macs(app, mac); @@ -1029,8 +1030,6 @@ nfp_tunnel_del_shared_mac(struct nfp_app *app, struct net_device *netdev, entry->bridge_count--; if (!entry->bridge_count && entry->ref_count) { - u16 nfp_mac_idx; - nfp_mac_idx = entry->index & ~NFP_TUN_PRE_TUN_IDX_BIT; if (__nfp_tunnel_offload_mac(app, mac, nfp_mac_idx, false)) { @@ -1046,7 +1045,6 @@ nfp_tunnel_del_shared_mac(struct nfp_app *app, struct net_device *netdev, /* If MAC is now used by 1 repr set the offloaded MAC index to port. */ if (entry->ref_count == 1 && list_is_singular(&entry->repr_list)) { - u16 nfp_mac_idx; int port, err; repr_priv = list_first_entry(&entry->repr_list, @@ -1074,8 +1072,14 @@ nfp_tunnel_del_shared_mac(struct nfp_app *app, struct net_device *netdev, WARN_ON_ONCE(rhashtable_remove_fast(&priv->tun.offloaded_macs, &entry->ht_node, offloaded_macs_params)); + + if (nfp_flower_is_supported_bridge(netdev)) + nfp_mac_idx = entry->index & ~NFP_TUN_PRE_TUN_IDX_BIT; + else + nfp_mac_idx = entry->index; + /* If MAC has global ID then extract and free the ida entry. */ - if (nfp_tunnel_is_mac_idx_global(entry->index)) { + if (nfp_tunnel_is_mac_idx_global(nfp_mac_idx)) { ida_idx = nfp_tunnel_get_ida_from_global_mac_idx(entry->index); ida_simple_remove(&priv->tun.mac_off_ids, ida_idx); } From 7c759040c1dd03954f650f147ae7175476d51314 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 8 Feb 2022 21:00:26 +0100 Subject: [PATCH 1199/1295] can: isotp: fix potential CAN frame reception race in isotp_rcv() When receiving a CAN frame the current code logic does not consider concurrently receiving processes which do not show up in real world usage. Ziyang Xuan writes: The following syz problem is one of the scenarios. so->rx.len is changed by isotp_rcv_ff() during isotp_rcv_cf(), so->rx.len equals 0 before alloc_skb() and equals 4096 after alloc_skb(). That will trigger skb_over_panic() in skb_put(). ======================================================= CPU: 1 PID: 19 Comm: ksoftirqd/1 Not tainted 5.16.0-rc8-syzkaller #0 RIP: 0010:skb_panic+0x16c/0x16e net/core/skbuff.c:113 Call Trace: skb_over_panic net/core/skbuff.c:118 [inline] skb_put.cold+0x24/0x24 net/core/skbuff.c:1990 isotp_rcv_cf net/can/isotp.c:570 [inline] isotp_rcv+0xa38/0x1e30 net/can/isotp.c:668 deliver net/can/af_can.c:574 [inline] can_rcv_filter+0x445/0x8d0 net/can/af_can.c:635 can_receive+0x31d/0x580 net/can/af_can.c:665 can_rcv+0x120/0x1c0 net/can/af_can.c:696 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5465 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5579 Therefore we make sure the state changes and data structures stay consistent at CAN frame reception time by adding a spin_lock in isotp_rcv(). This fixes the issue reported by syzkaller but does not affect real world operation. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/linux-can/d7e69278-d741-c706-65e1-e87623d9a8e8@huawei.com/T/ Link: https://lore.kernel.org/all/20220208200026.13783-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org Reported-by: syzbot+4c63f36709a642f801c5@syzkaller.appspotmail.com Reported-by: Ziyang Xuan Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 02cbcb2ecf0d..9149e8d8aefc 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,7 @@ struct isotp_sock { struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; + spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); @@ -615,11 +617,17 @@ static void isotp_rcv(struct sk_buff *skb, void *data) n_pci_type = cf->data[ae] & 0xF0; + /* Make sure the state changes and data structures stay consistent at + * CAN frame reception time. This locking is not needed in real world + * use cases but the inconsistency can be triggered with syzkaller. + */ + spin_lock(&so->rx_lock); + if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) - return; + goto out_unlock; } switch (n_pci_type) { @@ -668,6 +676,9 @@ static void isotp_rcv(struct sk_buff *skb, void *data) isotp_rcv_cf(sk, cf, ae, skb); break; } + +out_unlock: + spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, @@ -1444,6 +1455,7 @@ static int isotp_init(struct sock *sk) so->txtimer.function = isotp_tx_timer_handler; init_waitqueue_head(&so->wait); + spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); From 8375dfac4f683e1b2c5956d919d36aeedad46699 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 9 Feb 2022 08:36:01 +0100 Subject: [PATCH 1200/1295] can: isotp: fix error path in isotp_sendmsg() to unlock wait queue Commit 43a08c3bdac4 ("can: isotp: isotp_sendmsg(): fix TX buffer concurrent access in isotp_sendmsg()") introduced a new locking scheme that may render the userspace application in a locking state when an error is detected. This issue shows up under high load on simultaneously running isotp channels with identical configuration which is against the ISO specification and therefore breaks any reasonable PDU communication anyway. Fixes: 43a08c3bdac4 ("can: isotp: isotp_sendmsg(): fix TX buffer concurrent access in isotp_sendmsg()") Link: https://lore.kernel.org/all/20220209073601.25728-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org Cc: Ziyang Xuan Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 9149e8d8aefc..d2a430b6a13b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -887,7 +887,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!size || size > MAX_MSG_LENGTH) { err = -EINVAL; - goto err_out; + goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ @@ -897,24 +897,24 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; - goto err_out; + goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) - goto err_out; + goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; - goto err_out; + goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); - goto err_out; + goto err_out_drop; } can_skb_reserve(skb); @@ -976,7 +976,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); - goto err_out; + goto err_out_drop; } if (wait_tx_done) { @@ -989,6 +989,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return size; +err_out_drop: + /* drop this PDU and unlock a potential wait queue */ + old_state = ISOTP_IDLE; err_out: so->tx.state = old_state; if (so->tx.state == ISOTP_IDLE) From c162ca0bcbfb39308c4dff4157e27c751af7032a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 8 Feb 2022 11:37:56 +0100 Subject: [PATCH 1201/1295] gpio: sim: fix hogs with custom chip labels We always assign the default device name as the chip_label in hog structures which makes it impossible to assign hogs to chips. Let's first check if a custom label was set and then copy it instead of the default device name. Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- drivers/gpio/gpio-sim.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 04b137eca8da..153fe79e1bf3 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -570,6 +570,11 @@ static struct gpio_sim_bank *to_gpio_sim_bank(struct config_item *item) return container_of(group, struct gpio_sim_bank, group); } +static bool gpio_sim_bank_has_label(struct gpio_sim_bank *bank) +{ + return bank->label && *bank->label; +} + static struct gpio_sim_device * gpio_sim_bank_get_device(struct gpio_sim_bank *bank) { @@ -770,9 +775,15 @@ static int gpio_sim_add_hogs(struct gpio_sim_device *dev) * point the device doesn't exist yet and so dev_name() * is not available. */ - hog->chip_label = kasprintf(GFP_KERNEL, - "gpio-sim.%u-%s", dev->id, - fwnode_get_name(bank->swnode)); + if (gpio_sim_bank_has_label(bank)) + hog->chip_label = kstrdup(bank->label, + GFP_KERNEL); + else + hog->chip_label = kasprintf(GFP_KERNEL, + "gpio-sim.%u-%s", + dev->id, + fwnode_get_name( + bank->swnode)); if (!hog->chip_label) { gpio_sim_remove_hogs(dev); return -ENOMEM; @@ -816,7 +827,7 @@ gpio_sim_make_bank_swnode(struct gpio_sim_bank *bank, properties[prop_idx++] = PROPERTY_ENTRY_U32("ngpios", bank->num_lines); - if (bank->label && (strlen(bank->label) > 0)) + if (gpio_sim_bank_has_label(bank)) properties[prop_idx++] = PROPERTY_ENTRY_STRING("gpio-sim,label", bank->label); From cfc56f85e72f5b9c5c5be26dc2b16518d36a7868 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:18 +0100 Subject: [PATCH 1202/1295] net: do not keep the dst cache when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata a new dst+metadata is allocated and the tunnel information from the old metadata is copied over there. The issue is the tunnel metadata has references to cached dst, which are copied along the way. When a dst+metadata refcount drops to 0 the metadata is freed including the cached dst entries. As they are also referenced in the initial dst+metadata, this ends up in UaFs. In practice the above did not happen because of another issue, the dst+metadata was never freed because its refcount never dropped to 0 (this will be fixed in a subsequent patch). Fix this by initializing the dst cache after copying the tunnel information from the old metadata to also unshare the dst cache. Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Cc: Paolo Abeni Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 14efa0ded75d..b997e0c1e362 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -123,6 +123,19 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); From 9eeabdf17fa0ab75381045c867c370f4cc75a613 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:19 +0100 Subject: [PATCH 1203/1295] net: fix a memleak when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata, a new dst+metadata is allocated and later replaces the old one in the skb. This is helpful to have a non-shared dst+metadata attached to a specific skb. The issue is the uncloned dst+metadata is initialized with a refcount of 1, which is increased to 2 before attaching it to the skb. When tun_dst_unclone returns, the dst+metadata is only referenced from a single place (the skb) while its refcount is 2. Its refcount will never drop to 0 (when the skb is consumed), leading to a memory leak. Fix this by removing the call to dst_hold in tun_dst_unclone, as the dst+metadata refcount is already 1. Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.") Cc: Pravin B Shelar Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index b997e0c1e362..adab27ba1ecb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -137,7 +137,6 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) #endif skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } From 7ec02f5ac8a5be5a3f20611731243dc5e1d9ba10 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 8 Feb 2022 23:40:00 +0800 Subject: [PATCH 1204/1295] ax25: fix NPD bug in ax25_disconnect The ax25_disconnect() in ax25_kill_by_device() is not protected by any locks, thus there is a race condition between ax25_disconnect() and ax25_destroy_socket(). when ax25->sk is assigned as NULL by ax25_destroy_socket(), a NULL pointer dereference bug will occur if site (1) or (2) dereferences ax25->sk. ax25_kill_by_device() | ax25_release() ax25_disconnect() | ax25_destroy_socket() ... | if(ax25->sk != NULL) | ... ... | ax25->sk = NULL; bh_lock_sock(ax25->sk); //(1) | ... ... | bh_unlock_sock(ax25->sk); //(2)| This patch moves ax25_disconnect() into lock_sock(), which can synchronize with ax25_destroy_socket() in ax25_release(). Fail log: =============================================================== BUG: kernel NULL pointer dereference, address: 0000000000000088 ... RIP: 0010:_raw_spin_lock+0x7e/0xd0 ... Call Trace: ax25_disconnect+0xf6/0x220 ax25_device_event+0x187/0x250 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x17d/0x230 rollback_registered_many+0x1f1/0x950 unregister_netdevice_queue+0x133/0x200 unregister_netdev+0x13/0x20 ... Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3e49d28824ed..3d87040d1bfc 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -92,8 +92,8 @@ again: lock_sock(sk); s->ax25_dev = NULL; ax25_dev_put(ax25_dev); - release_sock(sk); ax25_disconnect(s, ENETUNREACH); + release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); /* The entry could have been deleted from the From 68468d8c4cd4222a4ca1f185ab5a1c14480d078c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Feb 2022 15:28:22 -0800 Subject: [PATCH 1205/1295] veth: fix races around rq->rx_notify_masked veth being NETIF_F_LLTX enabled, we need to be more careful whenever we read/write rq->rx_notify_masked. BUG: KCSAN: data-race in veth_xmit / veth_xmit write to 0xffff888133d9a9f8 of 1 bytes by task 23552 on cpu 0: __veth_xdp_flush drivers/net/veth.c:269 [inline] veth_xmit+0x307/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888133d9a9f8 of 1 bytes by task 23563 on cpu 1: __veth_xdp_flush drivers/net/veth.c:268 [inline] veth_xmit+0x2d6/0x470 drivers/net/veth.c:350 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 br_dev_queue_push_xmit+0x3ce/0x430 net/bridge/br_forward.c:53 NF_HOOK include/linux/netfilter.h:307 [inline] br_forward_finish net/bridge/br_forward.c:66 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] __br_forward+0x2e4/0x400 net/bridge/br_forward.c:115 br_flood+0x521/0x5c0 net/bridge/br_forward.c:242 br_dev_xmit+0x8b6/0x960 __netdev_start_xmit include/linux/netdevice.h:4683 [inline] netdev_start_xmit include/linux/netdevice.h:4697 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3473 dev_hard_start_xmit net/core/dev.c:3489 [inline] __dev_queue_xmit+0x86d/0xf90 net/core/dev.c:4116 dev_queue_xmit+0x13/0x20 net/core/dev.c:4149 neigh_hh_output include/net/neighbour.h:525 [inline] neigh_output include/net/neighbour.h:539 [inline] ip_finish_output2+0x6f8/0xb70 net/ipv4/ip_output.c:228 ip_finish_output+0xfb/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:451 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] ip_send_skb+0x6e/0xe0 net/ipv4/ip_output.c:1570 udp_send_skb+0x641/0x880 net/ipv4/udp.c:967 udp_sendmsg+0x12ea/0x14c0 net/ipv4/udp.c:1254 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23563 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00064-gc36c04c2e132 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 948d4f214fde ("veth: Add driver XDP") Signed-off-by: Eric Dumazet Cc: Toshiaki Makita Reported-by: syzbot Signed-off-by: David S. Miller --- drivers/net/veth.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 354a963075c5..d29fb9759cc9 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -265,9 +265,10 @@ static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!rq->rx_notify_masked) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (!READ_ONCE(rq->rx_notify_masked) && + napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); } } @@ -912,8 +913,10 @@ static int veth_poll(struct napi_struct *napi, int budget) /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); + } } } From bc1c3c3b10db4f37c41e6107751a8d450d9c431c Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 9 Feb 2022 10:33:59 +1030 Subject: [PATCH 1206/1295] net: mdio: aspeed: Add missing MODULE_DEVICE_TABLE Fix loading of the driver when built as a module. Fixes: f160e99462c6 ("net: phy: Add mdio-aspeed") Signed-off-by: Joel Stanley Reviewed-by: Andrew Lunn Acked-by: Andrew Jeffery Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-aspeed.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/mdio/mdio-aspeed.c b/drivers/net/mdio/mdio-aspeed.c index 966c3b4ad59d..e2273588c75b 100644 --- a/drivers/net/mdio/mdio-aspeed.c +++ b/drivers/net/mdio/mdio-aspeed.c @@ -148,6 +148,7 @@ static const struct of_device_id aspeed_mdio_of_match[] = { { .compatible = "aspeed,ast2600-mdio", }, { }, }; +MODULE_DEVICE_TABLE(of, aspeed_mdio_of_match); static struct platform_driver aspeed_mdio_driver = { .driver = { From c7223d687758462826a20e9735305d55bb874c70 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 8 Feb 2022 22:22:37 -0500 Subject: [PATCH 1207/1295] tipc: rate limit warning for received illegal binding update It would be easy to craft a message containing an illegal binding table update operation. This is handled correctly by the code, but the corresponding warning printout is not rate limited as is should be. We fix this now. Fixes: b97bf3fd8f6a ("[TIPC] Initial merge") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index bda902caa814..8267b751a526 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -313,7 +313,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n", ua.sr.type, ua.sr.lower, node); } else { - pr_warn("Unrecognized name table message received\n"); + pr_warn_ratelimited("Unknown name table message received\n"); } return false; } From 68c2d6af1f1e469544d6cbe9a601d96fb9c00e7f Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 9 Feb 2022 10:02:01 +0530 Subject: [PATCH 1208/1295] net: amd-xgbe: disable interrupts during pci removal Hardware interrupts are enabled during the pci probe, however, they are not disabled during pci removal. Disable all hardware interrupts during pci removal to avoid any issues. Fixes: e75377404726 ("amd-xgbe: Update PCI support to use new IRQ functions") Suggested-by: Selwin Sebastian Signed-off-by: Raju Rangoju Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c index efdcf484a510..2af3da4b2d05 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c @@ -425,6 +425,9 @@ static void xgbe_pci_remove(struct pci_dev *pdev) pci_free_irq_vectors(pdata->pcidev); + /* Disable all interrupts in the hardware */ + XP_IOWRITE(pdata, XP_INT_EN, 0x0); + xgbe_free_pdata(pdata); } From d9565bf40da22426d2f660cb31700b6858d1911d Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Wed, 2 Feb 2022 17:31:22 +0100 Subject: [PATCH 1209/1295] MIPS: DTS: CI20: fix how ddc power is enabled Originally we proposed a new hdmi-5v-supply regulator reference for CI20 device tree but that was superseded by a better idea to use the already defined "ddc-en-gpios" property of the "hdmi-connector". Since "MIPS: DTS: CI20: Add DT nodes for HDMI setup" has already been applied to v5.17-rc1, we add this on top. Fixes: ae1b8d2c2de9 ("MIPS: DTS: CI20: Add DT nodes for HDMI setup") Signed-off-by: H. Nikolaus Schaller Reviewed-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/ingenic/ci20.dts | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/mips/boot/dts/ingenic/ci20.dts b/arch/mips/boot/dts/ingenic/ci20.dts index 3e336b3dbb10..ab6e3dc0bc1d 100644 --- a/arch/mips/boot/dts/ingenic/ci20.dts +++ b/arch/mips/boot/dts/ingenic/ci20.dts @@ -83,6 +83,8 @@ label = "HDMI OUT"; type = "a"; + ddc-en-gpios = <&gpa 25 GPIO_ACTIVE_HIGH>; + port { hdmi_con: endpoint { remote-endpoint = <&dw_hdmi_out>; @@ -114,17 +116,6 @@ gpio = <&gpf 14 GPIO_ACTIVE_LOW>; enable-active-high; }; - - hdmi_power: fixedregulator@3 { - compatible = "regulator-fixed"; - - regulator-name = "hdmi_power"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - - gpio = <&gpa 25 0>; - enable-active-high; - }; }; &ext { @@ -576,8 +567,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pins_hdmi_ddc>; - hdmi-5v-supply = <&hdmi_power>; - ports { #address-cells = <1>; #size-cells = <0>; From ee534378f00561207656663d93907583958339ae Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 9 Feb 2022 14:04:33 +0200 Subject: [PATCH 1210/1295] net: dsa: fix panic when DSA master device unbinds on shutdown Rafael reports that on a system with LX2160A and Marvell DSA switches, if a reboot occurs while the DSA master (dpaa2-eth) is up, the following panic can be seen: systemd-shutdown[1]: Rebooting. Unable to handle kernel paging request at virtual address 00a0000800000041 [00a0000800000041] address between user and kernel address ranges Internal error: Oops: 96000004 [#1] PREEMPT SMP CPU: 6 PID: 1 Comm: systemd-shutdow Not tainted 5.16.5-00042-g8f5585009b24 #32 pc : dsa_slave_netdevice_event+0x130/0x3e4 lr : raw_notifier_call_chain+0x50/0x6c Call trace: dsa_slave_netdevice_event+0x130/0x3e4 raw_notifier_call_chain+0x50/0x6c call_netdevice_notifiers_info+0x54/0xa0 __dev_close_many+0x50/0x130 dev_close_many+0x84/0x120 unregister_netdevice_many+0x130/0x710 unregister_netdevice_queue+0x8c/0xd0 unregister_netdev+0x20/0x30 dpaa2_eth_remove+0x68/0x190 fsl_mc_driver_remove+0x20/0x5c __device_release_driver+0x21c/0x220 device_release_driver_internal+0xac/0xb0 device_links_unbind_consumers+0xd4/0x100 __device_release_driver+0x94/0x220 device_release_driver+0x28/0x40 bus_remove_device+0x118/0x124 device_del+0x174/0x420 fsl_mc_device_remove+0x24/0x40 __fsl_mc_device_remove+0xc/0x20 device_for_each_child+0x58/0xa0 dprc_remove+0x90/0xb0 fsl_mc_driver_remove+0x20/0x5c __device_release_driver+0x21c/0x220 device_release_driver+0x28/0x40 bus_remove_device+0x118/0x124 device_del+0x174/0x420 fsl_mc_bus_remove+0x80/0x100 fsl_mc_bus_shutdown+0xc/0x1c platform_shutdown+0x20/0x30 device_shutdown+0x154/0x330 __do_sys_reboot+0x1cc/0x250 __arm64_sys_reboot+0x20/0x30 invoke_syscall.constprop.0+0x4c/0xe0 do_el0_svc+0x4c/0x150 el0_svc+0x24/0xb0 el0t_64_sync_handler+0xa8/0xb0 el0t_64_sync+0x178/0x17c It can be seen from the stack trace that the problem is that the deregistration of the master causes a dev_close(), which gets notified as NETDEV_GOING_DOWN to dsa_slave_netdevice_event(). But dsa_switch_shutdown() has already run, and this has unregistered the DSA slave interfaces, and yet, the NETDEV_GOING_DOWN handler attempts to call dev_close_many() on those slave interfaces, leading to the problem. The previous attempt to avoid the NETDEV_GOING_DOWN on the master after dsa_switch_shutdown() was called seems improper. Unregistering the slave interfaces is unnecessary and unhelpful. Instead, after the slaves have stopped being uppers of the DSA master, we can now reset to NULL the master->dsa_ptr pointer, which will make DSA start ignoring all future notifier events on the master. Fixes: 0650bf52b31f ("net: dsa: be compatible with masters which unregister on shutdown") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 3d21521453fe..dcad3100b164 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1718,7 +1718,6 @@ EXPORT_SYMBOL_GPL(dsa_unregister_switch); void dsa_switch_shutdown(struct dsa_switch *ds) { struct net_device *master, *slave_dev; - LIST_HEAD(unregister_list); struct dsa_port *dp; mutex_lock(&dsa2_mutex); @@ -1729,25 +1728,13 @@ void dsa_switch_shutdown(struct dsa_switch *ds) slave_dev = dp->slave; netdev_upper_dev_unlink(master, slave_dev); - /* Just unlinking ourselves as uppers of the master is not - * sufficient. When the master net device unregisters, that will - * also call dev_close, which we will catch as NETDEV_GOING_DOWN - * and trigger a dev_close on our own devices (dsa_slave_close). - * In turn, that will call dev_mc_unsync on the master's net - * device. If the master is also a DSA switch port, this will - * trigger dsa_slave_set_rx_mode which will call dev_mc_sync on - * its own master. Lockdep will complain about the fact that - * all cascaded masters have the same dsa_master_addr_list_lock_key, - * which it normally would not do if the cascaded masters would - * be in a proper upper/lower relationship, which we've just - * destroyed. - * To suppress the lockdep warnings, let's actually unregister - * the DSA slave interfaces too, to avoid the nonsensical - * multicast address list synchronization on shutdown. - */ - unregister_netdevice_queue(slave_dev, &unregister_list); } - unregister_netdevice_many(&unregister_list); + + /* Disconnect from further netdevice notifiers on the master, + * since netdev_uses_dsa() will now return false. + */ + dsa_switch_for_each_cpu_port(dp, ds) + dp->master->dsa_ptr = NULL; rtnl_unlock(); mutex_unlock(&dsa2_mutex); From feef318c855a361a1eccd880f33e88c460eb63b4 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 9 Feb 2022 20:53:45 +0800 Subject: [PATCH 1211/1295] ax25: fix UAF bugs of net_device caused by rebinding operation The ax25_kill_by_device() will set s->ax25_dev = NULL and call ax25_disconnect() to change states of ax25_cb and sock, if we call ax25_bind() before ax25_kill_by_device(). However, if we call ax25_bind() again between the window of ax25_kill_by_device() and ax25_dev_device_down(), the values and states changed by ax25_kill_by_device() will be reassigned. Finally, ax25_dev_device_down() will deallocate net_device. If we dereference net_device in syscall functions such as ax25_release(), ax25_sendmsg(), ax25_getsockopt(), ax25_getname() and ax25_info_show(), a UAF bug will occur. One of the possible race conditions is shown below: (USE) | (FREE) ax25_bind() | | ax25_kill_by_device() ax25_bind() | ax25_connect() | ... | ax25_dev_device_down() | ... | dev_put_track(dev, ...) //FREE ax25_release() | ... ax25_send_control() | alloc_skb() //USE | the corresponding fail log is shown below: =============================================================== BUG: KASAN: use-after-free in ax25_send_control+0x43/0x210 ... Call Trace: ... ax25_send_control+0x43/0x210 ax25_release+0x2db/0x3b0 __sock_release+0x6d/0x120 sock_close+0xf/0x20 __fput+0x11f/0x420 ... Allocated by task 1283: ... __kasan_kmalloc+0x81/0xa0 alloc_netdev_mqs+0x5a/0x680 mkiss_open+0x6c/0x380 tty_ldisc_open+0x55/0x90 ... Freed by task 1969: ... kfree+0xa3/0x2c0 device_release+0x54/0xe0 kobject_put+0xa5/0x120 tty_ldisc_kill+0x3e/0x80 ... In order to fix these UAF bugs caused by rebinding operation, this patch adds dev_hold_track() into ax25_bind() and corresponding dev_put_track() into ax25_kill_by_device(). Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3d87040d1bfc..d53cbb4e2503 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -91,6 +91,7 @@ again: spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; + dev_put_track(ax25_dev->dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); ax25_disconnect(s, ENETUNREACH); release_sock(sk); @@ -1116,8 +1117,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } } - if (ax25_dev != NULL) + if (ax25_dev) { ax25_fillin_cb(ax25, ax25_dev); + dev_hold_track(ax25_dev->dev, &ax25_dev->dev_tracker, GFP_ATOMIC); + } done: ax25_cb_add(ax25); From 37aa50c539bcbcc01767e515bd170787fcfc0f33 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Feb 2022 03:19:55 -0500 Subject: [PATCH 1212/1295] vlan: introduce vlan_dev_free_egress_priority This patch is to introduce vlan_dev_free_egress_priority() to free egress priority for vlan dev, and keep vlan_dev_uninit() static as .ndo_uninit. It makes the code more clear and safer when adding new code in vlan_dev_uninit() in the future. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/8021q/vlan.h | 2 +- net/8021q/vlan_dev.c | 7 ++++++- net/8021q/vlan_netlink.c | 7 ++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 1a705a4ef7fa..5eaf38875554 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -129,6 +129,7 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); +void vlan_dev_free_egress_priority(const struct net_device *dev); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size); @@ -139,7 +140,6 @@ int vlan_check_real_dev(struct net_device *real_dev, void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); -void vlan_dev_uninit(struct net_device *dev); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 26d031a43cc1..e5d23e75572a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -622,7 +622,7 @@ static int vlan_dev_init(struct net_device *dev) } /* Note: this function might be called multiple times for the same device. */ -void vlan_dev_uninit(struct net_device *dev) +void vlan_dev_free_egress_priority(const struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); @@ -636,6 +636,11 @@ void vlan_dev_uninit(struct net_device *dev) } } +static void vlan_dev_uninit(struct net_device *dev) +{ + vlan_dev_free_egress_priority(dev); +} + static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 0db85aeb119b..53b1955b027f 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -183,10 +183,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; err = vlan_changelink(dev, tb, data, extack); - if (!err) - err = register_vlan_dev(dev, extack); if (err) - vlan_dev_uninit(dev); + return err; + err = register_vlan_dev(dev, extack); + if (err) + vlan_dev_free_egress_priority(dev); return err; } From d6ff94afd90b0ce8d1715f8ef77d4347d7a7f2c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Feb 2022 03:19:56 -0500 Subject: [PATCH 1213/1295] vlan: move dev_put into vlan_dev_uninit Shuang Li reported an QinQ issue by simply doing: # ip link add dummy0 type dummy # ip link add link dummy0 name dummy0.1 type vlan id 1 # ip link add link dummy0.1 name dummy0.1.2 type vlan id 2 # rmmod 8021q unregister_netdevice: waiting for dummy0.1 to become free. Usage count = 1 When rmmods 8021q, all vlan devs are deleted from their real_dev's vlan grp and added into list_kill by unregister_vlan_dev(). dummy0.1 is unregistered before dummy0.1.2, as it's using for_each_netdev() in __rtnl_kill_links(). When unregisters dummy0.1, dummy0.1.2 is not unregistered in the event of NETDEV_UNREGISTER, as it's been deleted from dummy0.1's vlan grp. However, due to dummy0.1.2 still holding dummy0.1, dummy0.1 will keep waiting in netdev_wait_allrefs(), while dummy0.1.2 will never get unregistered and release dummy0.1, as it delays dev_put until calling dev->priv_destructor, vlan_dev_free(). This issue was introduced by Commit 563bcbae3ba2 ("net: vlan: fix a UAF in vlan_dev_real_dev()"), and this patch is to fix it by moving dev_put() into vlan_dev_uninit(), which is called after NETDEV_UNREGISTER event but before netdev_wait_allrefs(). Fixes: 563bcbae3ba2 ("net: vlan: fix a UAF in vlan_dev_real_dev()") Reported-by: Shuang Li Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e5d23e75572a..d1902828a18a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -638,7 +638,12 @@ void vlan_dev_free_egress_priority(const struct net_device *dev) static void vlan_dev_uninit(struct net_device *dev) { + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + vlan_dev_free_egress_priority(dev); + + /* Get rid of the vlan's reference to real_dev */ + dev_put_track(vlan->real_dev, &vlan->dev_tracker); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, @@ -851,9 +856,6 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; - - /* Get rid of the vlan's reference to real_dev */ - dev_put_track(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) From 00e757b648c0935d703a9b8042312f4a76ee793b Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Tue, 8 Feb 2022 00:28:06 +0100 Subject: [PATCH 1214/1295] nvme: add nvme_complete_req tracepoint for batched completion Add NVMe request completion trace in nvme_complete_batch_req() because nvme:nvme_complete_req tracepoint is missing in case of request batched completion. Signed-off-by: Bean Huo Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 961a5f8a44d2..79005ea1a33e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_complete_batch_req(struct request *req) { + trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); nvme_end_req_zoned(req); } From 63573807b27e0faf8065a28b1bbe1cbfb23c0130 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 7 Feb 2022 00:40:13 +0200 Subject: [PATCH 1215/1295] nvme-tcp: fix bogus request completion when failing to send AER AER is not backed by a real request, hence we should not incorrectly assume that when failing to send a nvme command, it is a normal request but rather check if this is an aer and if so complete the aer (similar to the normal completion path). Cc: stable@vger.kernel.org Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 01e24b5703db..891a36d02e7c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -913,7 +913,15 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR); + if (nvme_tcp_async_req(req)) { + union nvme_result res = {}; + + nvme_complete_async_event(&req->queue->ctrl->ctrl, + cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res); + } else { + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), + NVME_SC_HOST_PATH_ERROR); + } } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) From 0cb4d23ae08c48f6bf3c29a8e5c4a74b8388b960 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 4 Feb 2022 15:19:34 -0500 Subject: [PATCH 1216/1295] NFSD: Fix the behavior of READ near OFFSET_MAX Dan Aloni reports: > Due to commit 8cfb9015280d ("NFS: Always provide aligned buffers to > the RPC read layers") on the client, a read of 0xfff is aligned up > to server rsize of 0x1000. > > As a result, in a test where the server has a file of size > 0x7fffffffffffffff, and the client tries to read from the offset > 0x7ffffffffffff000, the read causes loff_t overflow in the server > and it returns an NFS code of EINVAL to the client. The client as > a result indefinitely retries the request. The Linux NFS client does not handle NFS?ERR_INVAL, even though all NFS specifications permit servers to return that status code for a READ. Instead of NFS?ERR_INVAL, have out-of-range READ requests succeed and return a short result. Set the EOF flag in the result to prevent the client from retrying the READ request. This behavior appears to be consistent with Solaris NFS servers. Note that NFSv3 and NFSv4 use u64 offset values on the wire. These must be converted to loff_t internally before use -- an implicit type cast is not adequate for this purpose. Otherwise VFS checks against sb->s_maxbytes do not work properly. Reported-by: Dan Aloni Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 8 ++++++-- fs/nfsd/nfs4proc.c | 8 ++++++-- fs/nfsd/nfs4xdr.c | 8 ++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2c681785186f..b5a52528f19f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -150,13 +150,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp) unsigned int len; int v; - argp->count = min_t(u32, argp->count, max_blocksize); - dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + argp->count = min_t(u32, argp->count, max_blocksize); + if (argp->offset > (u64)OFFSET_MAX) + argp->offset = (u64)OFFSET_MAX; + if (argp->offset + argp->count > (u64)OFFSET_MAX) + argp->count = (u64)OFFSET_MAX - argp->offset; + v = 0; len = argp->count; resp->pages = rqstp->rq_next_page; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ed1ee25647be..71d735b125a0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -782,12 +782,16 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; read->rd_nf = NULL; - if (read->rd_offset >= OFFSET_MAX) - return nfserr_inval; trace_nfsd_read_start(rqstp, &cstate->current_fh, read->rd_offset, read->rd_length); + read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp)); + if (read->rd_offset > (u64)OFFSET_MAX) + read->rd_offset = (u64)OFFSET_MAX; + if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX) + read->rd_length = (u64)OFFSET_MAX - read->rd_offset; + /* * If we do a zero copy read, then a client will see read data * that reflects the state of the file *after* performing the diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 899de438e529..f5e3430bb6ff 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3986,10 +3986,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } xdr_commit_encode(xdr); - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, + maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); - maxcount = min_t(unsigned long, maxcount, read->rd_length); if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) @@ -4826,10 +4824,8 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr_resource; xdr_commit_encode(xdr); - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, + maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); - maxcount = min_t(unsigned long, maxcount, read->rd_length); count = maxcount; eof = read->rd_offset >= i_size_read(file_inode(file)); From e6faac3f58c7c4176b66f63def17a34232a17b0e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 31 Jan 2022 13:01:53 -0500 Subject: [PATCH 1217/1295] NFSD: Fix ia_size underflow iattr::ia_size is a loff_t, which is a signed 64-bit type. NFSv3 and NFSv4 both define file size as an unsigned 64-bit type. Thus there is a range of valid file size values an NFS client can send that is already larger than Linux can handle. Currently decode_fattr4() dumps a full u64 value into ia_size. If that value happens to be larger than S64_MAX, then ia_size underflows. I'm about to fix up the NFSv3 behavior as well, so let's catch the underflow in the common code path: nfsd_setattr(). Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 99c2b9dfbb10..0cccceb105e7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -435,6 +435,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, .ia_size = iap->ia_size, }; + host_err = -EFBIG; + if (iap->ia_size < 0) + goto out_unlock; + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); if (host_err) goto out_unlock; From a648fdeb7c0e17177a2280344d015dba3fbe3314 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jan 2022 15:59:57 -0500 Subject: [PATCH 1218/1295] NFSD: Fix NFSv3 SETATTR/CREATE's handling of large file sizes iattr::ia_size is a loff_t, so these NFSv3 procedures must be careful to deal with incoming client size values that are larger than s64_max without corrupting the value. Silently capping the value results in storing a different value than the client passed in which is unexpected behavior, so remove the min_t() check in decode_sattr3(). Note that RFC 1813 permits only the WRITE procedure to return NFS3ERR_FBIG. We believe that NFSv3 reference implementations also return NFS3ERR_FBIG when ia_size is too large. Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 7c45ba4db61b..2e47a07029f1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -254,7 +254,7 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (xdr_stream_decode_u64(xdr, &newsize) < 0) return false; iap->ia_valid |= ATTR_SIZE; - iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + iap->ia_size = newsize; } if (xdr_stream_decode_u32(xdr, &set_it) < 0) return false; From 6260d9a56ab352b54891ec66ab0eced57d55abc6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jan 2022 16:36:22 -0500 Subject: [PATCH 1219/1295] NFSD: Clamp WRITE offsets Ensure that a client cannot specify a WRITE range that falls in a byte range outside what the kernel's internal types (such as loff_t, which is signed) can represent. The kiocb iterators, invoked in nfsd_vfs_write(), should properly limit write operations to within the underlying file system's s_maxbytes. Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 5 +++++ fs/nfsd/nfs4proc.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b5a52528f19f..aca38ed1526e 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -203,6 +203,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp) (unsigned long long) argp->offset, argp->stable? " stable" : ""); + resp->status = nfserr_fbig; + if (argp->offset > (u64)OFFSET_MAX || + argp->offset + argp->len > (u64)OFFSET_MAX) + return rpc_success; + fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, &argp->payload); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 71d735b125a0..b207c76a873f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1022,8 +1022,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned long cnt; int nvecs; - if (write->wr_offset >= OFFSET_MAX) - return nfserr_inval; + if (write->wr_offset > (u64)OFFSET_MAX || + write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) + return nfserr_fbig; cnt = write->wr_buflen; trace_nfsd_write_start(rqstp, &cstate->current_fh, From 3f965021c8bc38965ecb1924f570c4842b33d408 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 24 Jan 2022 15:50:31 -0500 Subject: [PATCH 1220/1295] NFSD: COMMIT operations must not return NFS?ERR_INVAL Since, well, forever, the Linux NFS server's nfsd_commit() function has returned nfserr_inval when the passed-in byte range arguments were non-sensical. However, according to RFC 1813 section 3.3.21, NFSv3 COMMIT requests are permitted to return only the following non-zero status codes: NFS3ERR_IO NFS3ERR_STALE NFS3ERR_BADHANDLE NFS3ERR_SERVERFAULT NFS3ERR_INVAL is not included in that list. Likewise, NFS4ERR_INVAL is not listed in the COMMIT row of Table 6 in RFC 8881. RFC 7530 does permit COMMIT to return NFS4ERR_INVAL, but does not specify when it can or should be used. Instead of dropping or failing a COMMIT request in a byte range that is not supported, turn it into a valid request by treating one or both arguments as zero. Offset zero means start-of-file, count zero means until-end-of-file, so we only ever extend the commit range. NFS servers are always allowed to commit more and sooner than requested. The range check is no longer bounded by NFS_OFFSET_MAX, but rather by the value that is returned in the maxfilesize field of the NFSv3 FSINFO procedure or the NFSv4 maxfilesize file attribute. Note that this change results in a new pynfs failure: CMT4 st_commit.testCommitOverflow : RUNNING CMT4 st_commit.testCommitOverflow : FAILURE COMMIT with offset + count overflow should return NFS4ERR_INVAL, instead got NFS4_OK IMO the test is not correct as written: RFC 8881 does not allow the COMMIT operation to return NFS4ERR_INVAL. Reported-by: Dan Aloni Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever Reviewed-by: Bruce Fields --- fs/nfsd/nfs3proc.c | 6 ------ fs/nfsd/vfs.c | 53 +++++++++++++++++++++++++++++++--------------- fs/nfsd/vfs.h | 4 ++-- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index aca38ed1526e..52ad1972cc33 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -663,15 +663,9 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) argp->count, (unsigned long long) argp->offset); - if (argp->offset > NFS_OFFSET_MAX) { - resp->status = nfserr_inval; - goto out; - } - fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count, resp->verf); -out: return rpc_success; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0cccceb105e7..91600e71be19 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1114,42 +1114,61 @@ out: } #ifdef CONFIG_NFSD_V3 -/* - * Commit all pending writes to stable storage. +/** + * nfsd_commit - Commit pending writes to stable storage + * @rqstp: RPC request being processed + * @fhp: NFS filehandle + * @offset: raw offset from beginning of file + * @count: raw count of bytes to sync + * @verf: filled in with the server's current write verifier * - * Note: we only guarantee that data that lies within the range specified - * by the 'offset' and 'count' parameters will be synced. + * Note: we guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. The server + * is permitted to sync data that lies outside this range at the + * same time. * * Unfortunately we cannot lock the file to make sure we return full WCC * data to the client, as locking happens lower down in the filesystem. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, unsigned long count, __be32 *verf) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, + u32 count, __be32 *verf) { + u64 maxbytes; + loff_t start, end; struct nfsd_net *nn; struct nfsd_file *nf; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; - - if (offset < 0) - goto out; - if (count != 0) { - end = offset + (loff_t)count - 1; - if (end < offset) - goto out; - } + __be32 err; err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; + + /* + * Convert the client-provided (offset, count) range to a + * (start, end) range. If the client-provided range falls + * outside the maximum file size of the underlying FS, + * clamp the sync range appropriately. + */ + start = 0; + end = LLONG_MAX; + maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes; + if (offset < maxbytes) { + start = offset; + if (count && (offset + count - 1 < maxbytes)) + end = offset + count - 1; + } + nn = net_generic(nf->nf_net, nfsd_net_id); if (EX_ISSYNC(fhp->fh_export)) { errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); int err2; - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + err2 = vfs_fsync_range(nf->nf_file, start, end, 0); switch (err2) { case 0: nfsd_copy_write_verifier(verf, nn); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 9f56dcb22ff7..2c43d10e3cab 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -74,8 +74,8 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, u32 *verifier, bool *truncp, bool *created); -__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, - loff_t, unsigned long, __be32 *verf); +__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, + u64 offset, u32 count, __be32 *verf); #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, From 6a4d333d540041d244b2fca29b8417bfde20af81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 4 Feb 2022 17:05:24 -0500 Subject: [PATCH 1221/1295] NFSD: Fix offset type in I/O trace points NFSv3 and NFSv4 use u64 offset values on the wire. Record these values verbatim without the implicit type case to loff_t. Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c4cf56327843..5889db66409d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -306,14 +306,14 @@ TRACE_EVENT(nfsd_export_update, DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, - unsigned long len), + u64 offset, + u32 len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( __field(u32, xid) __field(u32, fh_hash) - __field(loff_t, offset) - __field(unsigned long, len) + __field(u64, offset) + __field(u32, len) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); @@ -321,7 +321,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class, __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u", __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) @@ -330,8 +330,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class, DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ - loff_t offset, \ - unsigned long len), \ + u64 offset, \ + u32 len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); From c306d737691ef84305d4ed0d302c63db2932f0bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jan 2022 15:57:45 -0500 Subject: [PATCH 1222/1295] NFSD: Deprecate NFS_OFFSET_MAX NFS_OFFSET_MAX was introduced way back in Linux v2.3.y before there was a kernel-wide OFFSET_MAX value. As a clean up, replace the last few uses of it with its generic equivalent, and get rid of it. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfs4xdr.c | 2 +- include/linux/nfs.h | 8 -------- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2e47a07029f1..0293b8d65f10 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1060,7 +1060,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, return false; /* cookie */ resp->cookie_offset = dirlist->len; - if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0) + if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0) return false; return true; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f5e3430bb6ff..714a3a3bd50c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3495,7 +3495,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, p = xdr_reserve_space(xdr, 3*4 + namlen); if (!p) goto fail; - p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 0dc7ad38a0da..b06375e88e58 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -36,14 +36,6 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc memcpy(target->data, source->data, source->size); } - -/* - * This is really a general kernel constant, but since nothing like - * this is defined in the kernel headers, I have to do it here. - */ -#define NFS_OFFSET_MAX ((__s64)((~(__u64)0) >> 1)) - - enum nfs3_stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, From ffc58bc4af9365d4eea72526bb3cf6a83615c673 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Wed, 9 Feb 2022 11:22:51 +0800 Subject: [PATCH 1223/1295] Drivers: hv: utils: Make use of the helper macro LIST_HEAD() Replace "struct list_head head = LIST_HEAD_INIT(head)" with "LIST_HEAD(head)" to simplify the code. Signed-off-by: Cai Huoqing Link: https://lore.kernel.org/r/20220209032251.37362-1-cai.huoqing@linux.dev Signed-off-by: Wei Liu --- drivers/hv/hv_utils_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c index eb2833d2b5d0..832885198643 100644 --- a/drivers/hv/hv_utils_transport.c +++ b/drivers/hv/hv_utils_transport.c @@ -13,7 +13,7 @@ #include "hv_utils_transport.h" static DEFINE_SPINLOCK(hvt_list_lock); -static struct list_head hvt_list = LIST_HEAD_INIT(hvt_list); +static LIST_HEAD(hvt_list); static void hvt_reset(struct hvutil_transport *hvt) { From b42bc9a3c5115c3102a4923776bbeed3b191f2db Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Wed, 9 Feb 2022 08:49:20 +0100 Subject: [PATCH 1224/1295] Fix regression due to "fs: move binfmt_misc sysctl to its own file" Commit 3ba442d5331f ("fs: move binfmt_misc sysctl to its own file") did not go unnoticed, binfmt-support stopped to work on my Debian system since v5.17-rc2 (did not check with -rc1). The existance of the /proc/sys/fs/binfmt_misc is a precondition for attempting to mount the binfmt_misc fs, which in turn triggers the autoload of the binfmt_misc module. Without it, no module is loaded and no binfmt is available at boot. Building as built-in or manually loading the module and mounting the fs works fine, it's therefore only a matter of interaction with user-space. I could try to improve the Debian systemd configuration but I can't say anything about the other distributions. This patch restores a working system right after boot. Fixes: 3ba442d5331f ("fs: move binfmt_misc sysctl to its own file") Signed-off-by: Domenico Andreoli Cc: Andrew Morton Cc: Luis Chamberlain Reviewed-by: Tong Zhang Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 6 +----- fs/file_table.c | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c07f35719ee3..e1eae7ea823a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -817,20 +817,16 @@ static struct file_system_type bm_fs_type = { }; MODULE_ALIAS_FS("binfmt_misc"); -static struct ctl_table_header *binfmt_misc_header; - static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - binfmt_misc_header = register_sysctl_mount_point("fs/binfmt_misc"); - return 0; + return err; } static void __exit exit_misc_binfmt(void) { - unregister_sysctl_table(binfmt_misc_header); unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } diff --git a/fs/file_table.c b/fs/file_table.c index 57edef16dce4..4969021fa676 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -119,6 +119,8 @@ static struct ctl_table fs_stat_sysctls[] = { static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); + if (IS_ENABLED(CONFIG_BINFMT_MISC)) + register_sysctl_mount_point("fs/binfmt_misc"); return 0; } fs_initcall(init_fs_stat_sysctls); From ea0eba69a2a8125229b1b6011644598039bc53aa Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 30 Jan 2022 20:53:15 +0800 Subject: [PATCH 1225/1295] btrfs: don't hold CPU for too long when defragging a file There is a user report about "btrfs filesystem defrag" causing 120s timeout problem. For btrfs_defrag_file() it will iterate all file extents if called from defrag ioctl, thus it can take a long time. There is no reason not to release the CPU during such a long operation. Add cond_resched() after defragged one cluster. CC: stable@vger.kernel.org # 5.16 Link: https://lore.kernel.org/linux-btrfs/10e51417-2203-f0a4-2021-86c8511cc367@gmx.com Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 925522756e28..b51c8b783f40 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1629,6 +1629,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = 0; break; } + cond_resched(); } if (ra_allocated) From 0d1ffa2228cb34f485f8fe927f134b82a0ea62ae Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Feb 2022 14:54:05 +0800 Subject: [PATCH 1226/1295] btrfs: defrag: don't try to defrag extents which are under writeback Once we start writeback (have called btrfs_run_delalloc_range()), we allocate an extent, create an extent map point to that extent, with a generation of (u64)-1, created the ordered extent and then clear the DELALLOC bit from the range in the inode's io tree. Such extent map can pass the first call of defrag_collect_targets(), as its generation is (u64)-1, meets any possible minimal generation check. And the range will not have DELALLOC bit, also passing the DELALLOC bit check. It will only be re-checked in the second call of defrag_collect_targets(), which will wait for writeback. But at that stage we have already spent our time waiting for some IO we may or may not want to defrag. Let's reject such extents early so we won't waste our time. CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b51c8b783f40..90136562d865 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1210,6 +1210,10 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + /* * Our start offset might be in the middle of an existing extent * map, so take that into account. From a0f0cf8341e34e5d2265bfd3a7ad68342da1e2aa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Feb 2022 15:26:09 +0000 Subject: [PATCH 1227/1295] btrfs: get rid of warning on transaction commit when using flushoncommit When using the flushoncommit mount option, during almost every transaction commit we trigger a warning from __writeback_inodes_sb_nr(): $ cat fs/fs-writeback.c: (...) static void __writeback_inodes_sb_nr(struct super_block *sb, ... { (...) WARN_ON(!rwsem_is_locked(&sb->s_umount)); (...) } (...) The trace produced in dmesg looks like the following: [947.473890] WARNING: CPU: 5 PID: 930 at fs/fs-writeback.c:2610 __writeback_inodes_sb_nr+0x7e/0xb3 [947.481623] Modules linked in: nfsd nls_cp437 cifs asn1_decoder cifs_arc4 fscache cifs_md4 ipmi_ssif [947.489571] CPU: 5 PID: 930 Comm: btrfs-transacti Not tainted 95.16.3-srb-asrock-00001-g36437ad63879 #186 [947.497969] RIP: 0010:__writeback_inodes_sb_nr+0x7e/0xb3 [947.502097] Code: 24 10 4c 89 44 24 18 c6 (...) [947.519760] RSP: 0018:ffffc90000777e10 EFLAGS: 00010246 [947.523818] RAX: 0000000000000000 RBX: 0000000000963300 RCX: 0000000000000000 [947.529765] RDX: 0000000000000000 RSI: 000000000000fa51 RDI: ffffc90000777e50 [947.535740] RBP: ffff888101628a90 R08: ffff888100955800 R09: ffff888100956000 [947.541701] R10: 0000000000000002 R11: 0000000000000001 R12: ffff888100963488 [947.547645] R13: ffff888100963000 R14: ffff888112fb7200 R15: ffff888100963460 [947.553621] FS: 0000000000000000(0000) GS:ffff88841fd40000(0000) knlGS:0000000000000000 [947.560537] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [947.565122] CR2: 0000000008be50c4 CR3: 000000000220c000 CR4: 00000000001006e0 [947.571072] Call Trace: [947.572354] [947.573266] btrfs_commit_transaction+0x1f1/0x998 [947.576785] ? start_transaction+0x3ab/0x44e [947.579867] ? schedule_timeout+0x8a/0xdd [947.582716] transaction_kthread+0xe9/0x156 [947.585721] ? btrfs_cleanup_transaction.isra.0+0x407/0x407 [947.590104] kthread+0x131/0x139 [947.592168] ? set_kthread_struct+0x32/0x32 [947.595174] ret_from_fork+0x22/0x30 [947.597561] [947.598553] ---[ end trace 644721052755541c ]--- This is because we started using writeback_inodes_sb() to flush delalloc when committing a transaction (when using -o flushoncommit), in order to avoid deadlocks with filesystem freeze operations. This change was made by commit ce8ea7cc6eb313 ("btrfs: don't call btrfs_start_delalloc_roots in flushoncommit"). After that change we started producing that warning, and every now and then a user reports this since the warning happens too often, it spams dmesg/syslog, and a user is unsure if this reflects any problem that might compromise the filesystem's reliability. We can not just lock the sb->s_umount semaphore before calling writeback_inodes_sb(), because that would at least deadlock with filesystem freezing, since at fs/super.c:freeze_super() sync_filesystem() is called while we are holding that semaphore in write mode, and that can trigger a transaction commit, resulting in a deadlock. It would also trigger the same type of deadlock in the unmount path. Possibly, it could also introduce some other locking dependencies that lockdep would report. To fix this call try_to_writeback_inodes_sb() instead of writeback_inodes_sb(), because that will try to read lock sb->s_umount and then will only call writeback_inodes_sb() if it was able to lock it. This is fine because the cases where it can't read lock sb->s_umount are during a filesystem unmount or during a filesystem freeze - in those cases sb->s_umount is write locked and sync_filesystem() is called, which calls writeback_inodes_sb(). In other words, in all cases where we can't take a read lock on sb->s_umount, writeback is already being triggered elsewhere. An alternative would be to call btrfs_start_delalloc_roots() with a number of pages different from LONG_MAX, for example matching the number of delalloc bytes we currently have, in which case we would end up starting all delalloc with filemap_fdatawrite_wbc() and not with an async flush via filemap_flush() - that is only possible after the rather recent commit e076ab2a2ca70a ("btrfs: shrink delalloc pages instead of full inodes"). However that creates a whole new can of worms due to new lock dependencies, which lockdep complains, like for example: [ 8948.247280] ====================================================== [ 8948.247823] WARNING: possible circular locking dependency detected [ 8948.248353] 5.17.0-rc1-btrfs-next-111 #1 Not tainted [ 8948.248786] ------------------------------------------------------ [ 8948.249320] kworker/u16:18/933570 is trying to acquire lock: [ 8948.249812] ffff9b3de1591690 (sb_internal#2){.+.+}-{0:0}, at: find_free_extent+0x141e/0x1590 [btrfs] [ 8948.250638] but task is already holding lock: [ 8948.251140] ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.252018] which lock already depends on the new lock. [ 8948.252710] the existing dependency chain (in reverse order) is: [ 8948.253343] -> #2 (&root->delalloc_mutex){+.+.}-{3:3}: [ 8948.253950] __mutex_lock+0x90/0x900 [ 8948.254354] start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.254859] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.255408] btrfs_commit_transaction+0x32f/0xc00 [btrfs] [ 8948.255942] btrfs_mksubvol+0x380/0x570 [btrfs] [ 8948.256406] btrfs_mksnapshot+0x81/0xb0 [btrfs] [ 8948.256870] __btrfs_ioctl_snap_create+0x17f/0x190 [btrfs] [ 8948.257413] btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs] [ 8948.257961] btrfs_ioctl+0x1196/0x3630 [btrfs] [ 8948.258418] __x64_sys_ioctl+0x83/0xb0 [ 8948.258793] do_syscall_64+0x3b/0xc0 [ 8948.259146] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 8948.259709] -> #1 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}: [ 8948.260330] __mutex_lock+0x90/0x900 [ 8948.260692] btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs] [ 8948.261234] btrfs_commit_transaction+0x32f/0xc00 [btrfs] [ 8948.261766] btrfs_set_free_space_cache_v1_active+0x38/0x60 [btrfs] [ 8948.262379] btrfs_start_pre_rw_mount+0x119/0x180 [btrfs] [ 8948.262909] open_ctree+0x1511/0x171e [btrfs] [ 8948.263359] btrfs_mount_root.cold+0x12/0xde [btrfs] [ 8948.263863] legacy_get_tree+0x30/0x50 [ 8948.264242] vfs_get_tree+0x28/0xc0 [ 8948.264594] vfs_kern_mount.part.0+0x71/0xb0 [ 8948.265017] btrfs_mount+0x11d/0x3a0 [btrfs] [ 8948.265462] legacy_get_tree+0x30/0x50 [ 8948.265851] vfs_get_tree+0x28/0xc0 [ 8948.266203] path_mount+0x2d4/0xbe0 [ 8948.266554] __x64_sys_mount+0x103/0x140 [ 8948.266940] do_syscall_64+0x3b/0xc0 [ 8948.267300] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 8948.267790] -> #0 (sb_internal#2){.+.+}-{0:0}: [ 8948.268322] __lock_acquire+0x12e8/0x2260 [ 8948.268733] lock_acquire+0xd7/0x310 [ 8948.269092] start_transaction+0x44c/0x6e0 [btrfs] [ 8948.269591] find_free_extent+0x141e/0x1590 [btrfs] [ 8948.270087] btrfs_reserve_extent+0x14b/0x280 [btrfs] [ 8948.270588] cow_file_range+0x17e/0x490 [btrfs] [ 8948.271051] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs] [ 8948.271586] writepage_delalloc+0xb5/0x170 [btrfs] [ 8948.272071] __extent_writepage+0x156/0x3c0 [btrfs] [ 8948.272579] extent_write_cache_pages+0x263/0x460 [btrfs] [ 8948.273113] extent_writepages+0x76/0x130 [btrfs] [ 8948.273573] do_writepages+0xd2/0x1c0 [ 8948.273942] filemap_fdatawrite_wbc+0x68/0x90 [ 8948.274371] start_delalloc_inodes+0x17f/0x400 [btrfs] [ 8948.274876] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.275417] flush_space+0x1f2/0x630 [btrfs] [ 8948.275863] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs] [ 8948.276438] process_one_work+0x252/0x5a0 [ 8948.276829] worker_thread+0x55/0x3b0 [ 8948.277189] kthread+0xf2/0x120 [ 8948.277506] ret_from_fork+0x22/0x30 [ 8948.277868] other info that might help us debug this: [ 8948.278548] Chain exists of: sb_internal#2 --> &fs_info->delalloc_root_mutex --> &root->delalloc_mutex [ 8948.279601] Possible unsafe locking scenario: [ 8948.280102] CPU0 CPU1 [ 8948.280508] ---- ---- [ 8948.280915] lock(&root->delalloc_mutex); [ 8948.281271] lock(&fs_info->delalloc_root_mutex); [ 8948.281915] lock(&root->delalloc_mutex); [ 8948.282487] lock(sb_internal#2); [ 8948.282800] *** DEADLOCK *** [ 8948.283333] 4 locks held by kworker/u16:18/933570: [ 8948.283750] #0: ffff9b3dc00a9d48 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0 [ 8948.284609] #1: ffffa90349dafe70 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0 [ 8948.285637] #2: ffff9b3e14db5040 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}, at: btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs] [ 8948.286674] #3: ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs] [ 8948.287596] stack backtrace: [ 8948.287975] CPU: 3 PID: 933570 Comm: kworker/u16:18 Not tainted 5.17.0-rc1-btrfs-next-111 #1 [ 8948.288677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 8948.289649] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs] [ 8948.290298] Call Trace: [ 8948.290517] [ 8948.290700] dump_stack_lvl+0x59/0x73 [ 8948.291026] check_noncircular+0xf3/0x110 [ 8948.291375] ? start_transaction+0x228/0x6e0 [btrfs] [ 8948.291826] __lock_acquire+0x12e8/0x2260 [ 8948.292241] lock_acquire+0xd7/0x310 [ 8948.292714] ? find_free_extent+0x141e/0x1590 [btrfs] [ 8948.293241] ? lock_is_held_type+0xea/0x140 [ 8948.293601] start_transaction+0x44c/0x6e0 [btrfs] [ 8948.294055] ? find_free_extent+0x141e/0x1590 [btrfs] [ 8948.294518] find_free_extent+0x141e/0x1590 [btrfs] [ 8948.294957] ? _raw_spin_unlock+0x29/0x40 [ 8948.295312] ? btrfs_get_alloc_profile+0x124/0x290 [btrfs] [ 8948.295813] btrfs_reserve_extent+0x14b/0x280 [btrfs] [ 8948.296270] cow_file_range+0x17e/0x490 [btrfs] [ 8948.296691] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs] [ 8948.297175] ? find_lock_delalloc_range+0x247/0x270 [btrfs] [ 8948.297678] writepage_delalloc+0xb5/0x170 [btrfs] [ 8948.298123] __extent_writepage+0x156/0x3c0 [btrfs] [ 8948.298570] extent_write_cache_pages+0x263/0x460 [btrfs] [ 8948.299061] extent_writepages+0x76/0x130 [btrfs] [ 8948.299495] do_writepages+0xd2/0x1c0 [ 8948.299817] ? sched_clock_cpu+0xd/0x110 [ 8948.300160] ? lock_release+0x155/0x4a0 [ 8948.300494] filemap_fdatawrite_wbc+0x68/0x90 [ 8948.300874] ? do_raw_spin_unlock+0x4b/0xa0 [ 8948.301243] start_delalloc_inodes+0x17f/0x400 [btrfs] [ 8948.301706] ? lock_release+0x155/0x4a0 [ 8948.302055] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs] [ 8948.302564] flush_space+0x1f2/0x630 [btrfs] [ 8948.302970] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs] [ 8948.303510] process_one_work+0x252/0x5a0 [ 8948.303860] ? process_one_work+0x5a0/0x5a0 [ 8948.304221] worker_thread+0x55/0x3b0 [ 8948.304543] ? process_one_work+0x5a0/0x5a0 [ 8948.304904] kthread+0xf2/0x120 [ 8948.305184] ? kthread_complete_and_exit+0x20/0x20 [ 8948.305598] ret_from_fork+0x22/0x30 [ 8948.305921] It all comes from the fact that btrfs_start_delalloc_roots() takes the delalloc_root_mutex, in the transaction commit path we are holding a read lock on one of the superblock's freeze semaphores (via sb_start_intwrite()), the async reclaim task can also do a call to btrfs_start_delalloc_roots(), which ends up triggering writeback with calls to filemap_fdatawrite_wbc(), resulting in extent allocation which in turn can call btrfs_start_transaction(), which will result in taking the freeze semaphore via sb_start_intwrite(), forming a nasty dependency on all those locks which can be taken in different orders by different code paths. So just adopt the simple approach of calling try_to_writeback_inodes_sb() at btrfs_start_delalloc_flush(). Link: https://lore.kernel.org/linux-btrfs/20220130005258.GA7465@cuci.nl/ Link: https://lore.kernel.org/linux-btrfs/43acc426-d683-d1b6-729d-c6bc4a2fff4d@gmail.com/ Link: https://lore.kernel.org/linux-btrfs/6833930a-08d7-6fbc-0141-eb9cdfd6bb4d@gmail.com/ Link: https://lore.kernel.org/linux-btrfs/20190322041731.GF16651@hungrycats.org/ Reviewed-by: Omar Sandoval Signed-off-by: Filipe Manana [ add more link reports ] Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c43bbc7f623e..c3cfdfd8de9b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1981,16 +1981,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* - * We use writeback_inodes_sb here because if we used + * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. + * + * Note that try_to_writeback_inodes_sb() will only trigger writeback + * if it can read lock sb->s_umount. It will always be able to lock it, + * except when the filesystem is being unmounted or being frozen, but in + * those cases sync_filesystem() is called, which results in calling + * writeback_inodes_sb() while holding a write lock on sb->s_umount. + * Note that we don't call writeback_inodes_sb() directly, because it + * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } From 2e7be9db125a0bf940c5d65eb5c40d8700f738b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C4=81vis=20Mos=C4=81ns?= Date: Sat, 5 Feb 2022 20:48:23 +0200 Subject: [PATCH 1228/1295] btrfs: send: in case of IO error log it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if we get IO error while doing send then we abort without logging information about which file caused issue. So log it to help with debugging. CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Dāvis Mosāns Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d8ccb62aa7d2..201eb2628aea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4999,6 +4999,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); put_page(page); ret = -EIO; break; From da5fb9e1ad3fbf632dce735f1bdad257ca528499 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 3 Feb 2022 19:31:24 +0000 Subject: [PATCH 1229/1295] ACPI/IORT: Check node revision for PMCG resources The original version of the IORT PMCG definition had an oversight wherein there was no way to describe the second register page for an implementation using the recommended RELOC_CTRS feature. Although the spec was fixed, and the final patches merged to ACPICA and Linux written against the new version, it seems that some old firmware based on the original revision has survived and turned up in the wild. Add a check for the original PMCG definition, and avoid filling in the second memory resource with nonsense if so. Otherwise it is likely that something horrible will happen when the PMCG driver attempts to probe. Reported-by: Michael Petlan Fixes: 24e516049360 ("ACPI/IORT: Add support for PMCG") Cc: # 5.2.x Signed-off-by: Robin Murphy Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/75628ae41c257fb73588f7bf1c4459160e04be2b.1643916258.git.robin.murphy@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 3b23fb775ac4..f2f8f05662de 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1361,9 +1361,17 @@ static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res, res[0].start = pmcg->page0_base_address; res[0].end = pmcg->page0_base_address + SZ_4K - 1; res[0].flags = IORESOURCE_MEM; - res[1].start = pmcg->page1_base_address; - res[1].end = pmcg->page1_base_address + SZ_4K - 1; - res[1].flags = IORESOURCE_MEM; + /* + * The initial version in DEN0049C lacked a way to describe register + * page 1, which makes it broken for most PMCG implementations; in + * that case, just let the driver fail gracefully if it expects to + * find a second memory resource. + */ + if (node->revision > 0) { + res[1].start = pmcg->page1_base_address; + res[1].end = pmcg->page1_base_address + SZ_4K - 1; + res[1].flags = IORESOURCE_MEM; + } if (pmcg->overflow_gsiv) acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow", From 3eb616b26408ac813c67280cf883f36d98b8441d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 9 Feb 2022 17:13:42 +0100 Subject: [PATCH 1230/1295] x86/PCI: revert "Ignore E820 reservations for bridge windows on newer systems" Commit 7f7b4236f204 ("x86/PCI: Ignore E820 reservations for bridge windows on newer systems") fixes the touchpad not working on laptops like the Lenovo IdeaPad 3 15IIL05 and the Lenovo IdeaPad 5 14IIL05, as well as fixing thunderbolt hotplug issues on the Lenovo Yoga C940. Unfortunately it turns out that this is causing issues with suspend/resume on Lenovo ThinkPad X1 Carbon Gen 2 laptops. So, per the no regressions policy, rever this. Note I'm looking into another fix for the issues this fixed. Fixes: 7f7b4236f204 ("x86/PCI: Ignore E820 reservations for bridge windows on newer systems") BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=2029207 Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/resource.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 9ae64f9af956..9b9fb7882c20 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include #include @@ -24,31 +23,11 @@ static void resource_clip(struct resource *res, resource_size_t start, res->start = end + 1; } -/* - * Some BIOS-es contain a bug where they add addresses which map to - * system RAM in the PCI host bridge window returned by the ACPI _CRS - * method, see commit 4dc2287c1805 ("x86: avoid E820 regions when - * allocating address space"). To avoid this Linux by default excludes - * E820 reservations when allocating addresses since 2010. - * In 2019 some systems have shown-up with E820 reservations which cover - * the entire _CRS returned PCI host bridge window, causing all attempts - * to assign memory to PCI BARs to fail if Linux uses E820 reservations. - * - * Ideally Linux would fully stop using E820 reservations, but then - * the old systems this was added for will regress. - * Instead keep the old behavior for old systems, while ignoring the - * E820 reservations for any systems from now on. - */ static void remove_e820_regions(struct resource *avail) { - int i, year = dmi_get_bios_year(); + int i; struct e820_entry *entry; - if (year >= 2018) - return; - - pr_info_once("PCI: Removing E820 reservations from host bridge windows\n"); - for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; From 03ad3093c7c069d6ab4403730009ebafeea9ee37 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 25 Jan 2022 15:49:47 -0600 Subject: [PATCH 1231/1295] display/amd: decrease message verbosity about watermarks table failure A number of BIOS versions have a problem with the watermarks table not being configured properly. This manifests as a very scary looking warning during resume from s0i3. This should be harmless in most cases and is well understood, so decrease the assertion to a clearer warning about the problem. Reviewed-by: Harry Wentland Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c index a1011f3273f3..de3f4643eeef 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c @@ -120,7 +120,11 @@ static int dcn31_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, result = dcn31_smu_wait_for_response(clk_mgr, 10, 200000); if (result == VBIOSSMC_Result_Failed) { - ASSERT(0); + if (msg_id == VBIOSSMC_MSG_TransferTableDram2Smu && + param == TABLE_WATERMARKS) + DC_LOG_WARNING("Watermarks table not configured properly by SMU"); + else + ASSERT(0); REG_WRITE(MP1_SMN_C2PMSG_91, VBIOSSMC_Result_OK); return -1; } From a072312f43c33ea02ad88bff3375f650684a6f24 Mon Sep 17 00:00:00 2001 From: Aaron Liu Date: Sat, 29 Jan 2022 09:21:31 +0800 Subject: [PATCH 1232/1295] drm/amdgpu: add utcl2_harvest to gc 10.3.1 Confirmed with hardware team, there is harvesting for gc 10.3.1. Signed-off-by: Aaron Liu Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c index b4eddf6e98a6..ff738e9725ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c @@ -543,7 +543,9 @@ static void gfxhub_v2_1_utcl2_harvest(struct amdgpu_device *adev) adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines); - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 3)) { + switch (adev->ip_versions[GC_HWIP][0]) { + case IP_VERSION(10, 3, 1): + case IP_VERSION(10, 3, 3): /* Get SA disabled bitmap from eFuse setting */ efuse_setting = RREG32_SOC15(GC, 0, mmCC_GC_SA_UNIT_DISABLE); efuse_setting &= CC_GC_SA_UNIT_DISABLE__SA_DISABLE_MASK; @@ -566,6 +568,9 @@ static void gfxhub_v2_1_utcl2_harvest(struct amdgpu_device *adev) disabled_sa = tmp; WREG32_SOC15(GC, 0, mmGCUTCL2_HARVEST_BYPASS_GROUPS_YELLOW_CARP, disabled_sa); + break; + default: + break; } } From 328e34a5ad227399391891d454043e5d73e598d2 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 2 Feb 2022 14:30:09 -0500 Subject: [PATCH 1233/1295] drm/amd/display: Cap pflip irqs per max otg number [Why] pflip interrupt order are mapped 1 to 1 to otg id. e.g. if irq_src=26 corresponds to otg0 then 27->otg1, 28->otg2... Linux DM registers pflip interrupts per number of crtcs. In fused pipe case crtc numbers can be less than otg id. e.g. if one pipe out of 3(otg#0-2) is fused adev->mode_info.num_crtc=2 so DM only registers irq_src 26,27. This is a bug since if pipe#2 remains unfused DM never gets otg2 pflip interrupt (irq_src=28) That may results in gfx failure due to pflip timeout. [How] Register pflip interrupts per max num of otg instead of num_crtc Signed-off-by: Roman Li Reviewed-by: Nicholas Kazlauskas Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 ++ drivers/gpu/drm/amd/display/dc/dc.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7f9773f8dab6..7c1c623ba799 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3653,7 +3653,7 @@ static int dcn10_register_irq_handlers(struct amdgpu_device *adev) /* Use GRPH_PFLIP interrupt */ for (i = DCN_1_0__SRCID__HUBP0_FLIP_INTERRUPT; - i <= DCN_1_0__SRCID__HUBP0_FLIP_INTERRUPT + adev->mode_info.num_crtc - 1; + i <= DCN_1_0__SRCID__HUBP0_FLIP_INTERRUPT + dc->caps.max_otg_num - 1; i++) { r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DCE, i, &adev->pageflip_irq); if (r) { diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 6f5528d34093..d18e9f3ea998 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1220,6 +1220,8 @@ struct dc *dc_create(const struct dc_init_data *init_params) dc->caps.max_dp_protocol_version = DP_VERSION_1_4; + dc->caps.max_otg_num = dc->res_pool->res_cap->num_timing_generator; + if (dc->res_pool->dmcu != NULL) dc->versions.dmcu_version = dc->res_pool->dmcu->dmcu_version; } diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 288e7b01f561..b51864890621 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -202,6 +202,7 @@ struct dc_caps { bool edp_dsc_support; bool vbios_lttpr_aware; bool vbios_lttpr_enable; + uint32_t max_otg_num; }; struct dc_bug_wa { From 60fdf98a774eee244a4e00c34a9e7729b61d0f44 Mon Sep 17 00:00:00 2001 From: Dmytro Laktyushkin Date: Thu, 27 Jan 2022 11:55:49 -0500 Subject: [PATCH 1234/1295] drm/amd/display: fix yellow carp wm clamping Fix clamping to match register field size Reviewed-by: Charlene Liu Acked-by: Jasdeep Dhillon Signed-off-by: Dmytro Laktyushkin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dcn31/dcn31_hubbub.c | 61 ++++++++++--------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hubbub.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hubbub.c index 90c73a1cb986..5e3bcaf12cac 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hubbub.c @@ -138,8 +138,11 @@ static uint32_t convert_and_clamp( ret_val = wm_ns * refclk_mhz; ret_val /= 1000; - if (ret_val > clamp_value) + if (ret_val > clamp_value) { + /* clamping WMs is abnormal, unexpected and may lead to underflow*/ + ASSERT(0); ret_val = clamp_value; + } return ret_val; } @@ -159,7 +162,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->a.urgent_ns > hubbub2->watermarks.a.urgent_ns) { hubbub2->watermarks.a.urgent_ns = watermarks->a.urgent_ns; prog_wm_value = convert_and_clamp(watermarks->a.urgent_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_A, 0, DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_A, prog_wm_value); @@ -193,7 +196,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->a.urgent_latency_ns > hubbub2->watermarks.a.urgent_latency_ns) { hubbub2->watermarks.a.urgent_latency_ns = watermarks->a.urgent_latency_ns; prog_wm_value = convert_and_clamp(watermarks->a.urgent_latency_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_A, 0, DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_A, prog_wm_value); } else if (watermarks->a.urgent_latency_ns < hubbub2->watermarks.a.urgent_latency_ns) @@ -203,7 +206,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->b.urgent_ns > hubbub2->watermarks.b.urgent_ns) { hubbub2->watermarks.b.urgent_ns = watermarks->b.urgent_ns; prog_wm_value = convert_and_clamp(watermarks->b.urgent_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_B, 0, DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_B, prog_wm_value); @@ -237,7 +240,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->b.urgent_latency_ns > hubbub2->watermarks.b.urgent_latency_ns) { hubbub2->watermarks.b.urgent_latency_ns = watermarks->b.urgent_latency_ns; prog_wm_value = convert_and_clamp(watermarks->b.urgent_latency_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_B, 0, DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_B, prog_wm_value); } else if (watermarks->b.urgent_latency_ns < hubbub2->watermarks.b.urgent_latency_ns) @@ -247,7 +250,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->c.urgent_ns > hubbub2->watermarks.c.urgent_ns) { hubbub2->watermarks.c.urgent_ns = watermarks->c.urgent_ns; prog_wm_value = convert_and_clamp(watermarks->c.urgent_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_C, 0, DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_C, prog_wm_value); @@ -281,7 +284,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->c.urgent_latency_ns > hubbub2->watermarks.c.urgent_latency_ns) { hubbub2->watermarks.c.urgent_latency_ns = watermarks->c.urgent_latency_ns; prog_wm_value = convert_and_clamp(watermarks->c.urgent_latency_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_C, 0, DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_C, prog_wm_value); } else if (watermarks->c.urgent_latency_ns < hubbub2->watermarks.c.urgent_latency_ns) @@ -291,7 +294,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->d.urgent_ns > hubbub2->watermarks.d.urgent_ns) { hubbub2->watermarks.d.urgent_ns = watermarks->d.urgent_ns; prog_wm_value = convert_and_clamp(watermarks->d.urgent_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_D, 0, DCHUBBUB_ARB_DATA_URGENCY_WATERMARK_D, prog_wm_value); @@ -325,7 +328,7 @@ static bool hubbub31_program_urgent_watermarks( if (safe_to_lower || watermarks->d.urgent_latency_ns > hubbub2->watermarks.d.urgent_latency_ns) { hubbub2->watermarks.d.urgent_latency_ns = watermarks->d.urgent_latency_ns; prog_wm_value = convert_and_clamp(watermarks->d.urgent_latency_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0x3fff); REG_SET(DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_D, 0, DCHUBBUB_ARB_REFCYC_PER_TRIP_TO_MEMORY_D, prog_wm_value); } else if (watermarks->d.urgent_latency_ns < hubbub2->watermarks.d.urgent_latency_ns) @@ -351,7 +354,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->a.cstate_pstate.cstate_enter_plus_exit_ns; prog_wm_value = convert_and_clamp( watermarks->a.cstate_pstate.cstate_enter_plus_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_A, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_A, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_EXIT_WATERMARK_A calculated =%d\n" @@ -367,7 +370,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->a.cstate_pstate.cstate_exit_ns; prog_wm_value = convert_and_clamp( watermarks->a.cstate_pstate.cstate_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_A, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_A, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_A calculated =%d\n" @@ -383,7 +386,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->a.cstate_pstate.cstate_enter_plus_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->a.cstate_pstate.cstate_enter_plus_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_A, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_A, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_WATERMARK_Z8_A calculated =%d\n" @@ -399,7 +402,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->a.cstate_pstate.cstate_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->a.cstate_pstate.cstate_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_A, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_A, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_Z8_A calculated =%d\n" @@ -416,7 +419,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->b.cstate_pstate.cstate_enter_plus_exit_ns; prog_wm_value = convert_and_clamp( watermarks->b.cstate_pstate.cstate_enter_plus_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_B, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_B, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_EXIT_WATERMARK_B calculated =%d\n" @@ -432,7 +435,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->b.cstate_pstate.cstate_exit_ns; prog_wm_value = convert_and_clamp( watermarks->b.cstate_pstate.cstate_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_B, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_B, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_B calculated =%d\n" @@ -448,7 +451,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->b.cstate_pstate.cstate_enter_plus_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->b.cstate_pstate.cstate_enter_plus_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_B, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_B, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_WATERMARK_Z8_B calculated =%d\n" @@ -464,7 +467,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->b.cstate_pstate.cstate_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->b.cstate_pstate.cstate_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_B, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_B, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_Z8_B calculated =%d\n" @@ -481,7 +484,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->c.cstate_pstate.cstate_enter_plus_exit_ns; prog_wm_value = convert_and_clamp( watermarks->c.cstate_pstate.cstate_enter_plus_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_C, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_C, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_EXIT_WATERMARK_C calculated =%d\n" @@ -497,7 +500,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->c.cstate_pstate.cstate_exit_ns; prog_wm_value = convert_and_clamp( watermarks->c.cstate_pstate.cstate_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_C, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_C, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_C calculated =%d\n" @@ -513,7 +516,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->c.cstate_pstate.cstate_enter_plus_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->c.cstate_pstate.cstate_enter_plus_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_C, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_C, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_WATERMARK_Z8_C calculated =%d\n" @@ -529,7 +532,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->c.cstate_pstate.cstate_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->c.cstate_pstate.cstate_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_C, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_C, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_Z8_C calculated =%d\n" @@ -546,7 +549,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->d.cstate_pstate.cstate_enter_plus_exit_ns; prog_wm_value = convert_and_clamp( watermarks->d.cstate_pstate.cstate_enter_plus_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_D, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_D, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_EXIT_WATERMARK_D calculated =%d\n" @@ -562,7 +565,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->d.cstate_pstate.cstate_exit_ns; prog_wm_value = convert_and_clamp( watermarks->d.cstate_pstate.cstate_exit_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_D, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_D, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_D calculated =%d\n" @@ -578,7 +581,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->d.cstate_pstate.cstate_enter_plus_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->d.cstate_pstate.cstate_enter_plus_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_D, 0, DCHUBBUB_ARB_ALLOW_SR_ENTER_WATERMARK_Z8_D, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_ENTER_WATERMARK_Z8_D calculated =%d\n" @@ -594,7 +597,7 @@ static bool hubbub31_program_stutter_watermarks( watermarks->d.cstate_pstate.cstate_exit_z8_ns; prog_wm_value = convert_and_clamp( watermarks->d.cstate_pstate.cstate_exit_z8_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_D, 0, DCHUBBUB_ARB_ALLOW_SR_EXIT_WATERMARK_Z8_D, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("SR_EXIT_WATERMARK_Z8_D calculated =%d\n" @@ -625,7 +628,7 @@ static bool hubbub31_program_pstate_watermarks( watermarks->a.cstate_pstate.pstate_change_ns; prog_wm_value = convert_and_clamp( watermarks->a.cstate_pstate.pstate_change_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_A, 0, DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_A, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("DRAM_CLK_CHANGE_WATERMARK_A calculated =%d\n" @@ -642,7 +645,7 @@ static bool hubbub31_program_pstate_watermarks( watermarks->b.cstate_pstate.pstate_change_ns; prog_wm_value = convert_and_clamp( watermarks->b.cstate_pstate.pstate_change_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_B, 0, DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_B, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("DRAM_CLK_CHANGE_WATERMARK_B calculated =%d\n" @@ -659,7 +662,7 @@ static bool hubbub31_program_pstate_watermarks( watermarks->c.cstate_pstate.pstate_change_ns; prog_wm_value = convert_and_clamp( watermarks->c.cstate_pstate.pstate_change_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_C, 0, DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_C, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("DRAM_CLK_CHANGE_WATERMARK_C calculated =%d\n" @@ -676,7 +679,7 @@ static bool hubbub31_program_pstate_watermarks( watermarks->d.cstate_pstate.pstate_change_ns; prog_wm_value = convert_and_clamp( watermarks->d.cstate_pstate.pstate_change_ns, - refclk_mhz, 0x1fffff); + refclk_mhz, 0xffff); REG_SET(DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_D, 0, DCHUBBUB_ARB_ALLOW_DRAM_CLK_CHANGE_WATERMARK_D, prog_wm_value); DC_LOG_BANDWIDTH_CALCS("DRAM_CLK_CHANGE_WATERMARK_D calculated =%d\n" From ad787771b43602d64e02b5963f4192232b46366b Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Thu, 27 Jan 2022 22:08:53 -0500 Subject: [PATCH 1235/1295] drm/amd/display: keep eDP Vdd on when eDP stream is already enabled [Why] Even if can_apply_edp_fast_boot is set to 1 at boot, this flag will be cleared to 0 at S3 resume. [How] Keep eDP Vdd on when eDP stream is already enabled. Reviewed-by: Charlene Liu Acked-by: Jasdeep Dhillon Signed-off-by: Zhan Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/dce110/dce110_hw_sequencer.c | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index 26ec69bb5db9..eb2755bdb30e 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -1834,9 +1834,29 @@ void dce110_enable_accelerated_mode(struct dc *dc, struct dc_state *context) break; } } - // We are trying to enable eDP, don't power down VDD - if (can_apply_edp_fast_boot) + + /* + * TO-DO: So far the code logic below only addresses single eDP case. + * For dual eDP case, there are a few things that need to be + * implemented first: + * + * 1. Change the fastboot logic above, so eDP link[0 or 1]'s + * stream[0 or 1] will all be checked. + * + * 2. Change keep_edp_vdd_on to an array, and maintain keep_edp_vdd_on + * for each eDP. + * + * Once above 2 things are completed, we can then change the logic below + * correspondingly, so dual eDP case will be fully covered. + */ + + // We are trying to enable eDP, don't power down VDD if eDP stream is existing + if ((edp_stream_num == 1 && edp_streams[0] != NULL) || can_apply_edp_fast_boot) { keep_edp_vdd_on = true; + DC_LOG_EVENT_LINK_TRAINING("Keep eDP Vdd on\n"); + } else { + DC_LOG_EVENT_LINK_TRAINING("No eDP stream enabled, turn eDP Vdd off\n"); + } } // Check seamless boot support From a8b1e8636a3252daa729762b2e3cc9015cc91a5c Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 8 Feb 2022 14:23:55 +0800 Subject: [PATCH 1236/1295] drm/amd/pm: fix hwmon node of power1_label create issue it will cause hwmon node of power1_label is not created. v2: the hwmon node of "power1_label" is always needed for all ASICs. and the patch will remove ASIC type check for "power1_label". Fixes: ae07970a0621d6 ("drm/amd/pm: add support for hwmon control of slow and fast PPT limit on vangogh") Signed-off-by: Yang Wang Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index e2cae97f4ff1..48cc009d9bdf 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -3462,8 +3462,7 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj, attr == &sensor_dev_attr_power2_cap_min.dev_attr.attr || attr == &sensor_dev_attr_power2_cap.dev_attr.attr || attr == &sensor_dev_attr_power2_cap_default.dev_attr.attr || - attr == &sensor_dev_attr_power2_label.dev_attr.attr || - attr == &sensor_dev_attr_power1_label.dev_attr.attr)) + attr == &sensor_dev_attr_power2_label.dev_attr.attr)) return 0; return effective_mode; From 7a82f89de92aac5a244d3735b2bd162c1147620c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 9 Feb 2022 14:49:38 -0500 Subject: [PATCH 1237/1295] audit: don't deref the syscall args when checking the openat2 open_how::flags As reported by Jeff, dereferencing the openat2 syscall argument in audit_match_perm() to obtain the open_how::flags can result in an oops/page-fault. This patch fixes this by using the open_how struct that we store in the audit_context with audit_openat2_how(). Independent of this patch, Richard Guy Briggs posted a similar patch to the audit mailing list roughly 40 minutes after this patch was posted. Cc: stable@vger.kernel.org Fixes: 1c30e3af8a79 ("audit: add support for the openat2 syscall") Reported-by: Jeff Mahoney Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fce5d43a933f..a83928cbdcb7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: - return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags); + return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } From dd9cb842fa9d90653a9b48aba52f89c069f3bc50 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Wed, 2 Feb 2022 21:45:56 +0100 Subject: [PATCH 1238/1295] s390/cio: verify the driver availability for path_event call If no driver is attached to a device or the driver does not provide the path_event function, an FCES path-event on this device could end up in a kernel-panic. Verify the driver availability before the path_event function call. Fixes: 32ef938815c1 ("s390/cio: Add support for FCES status notification") Cc: stable@vger.kernel.org Signed-off-by: Vineeth Vijayan Suggested-by: Peter Oberparleiter Reviewed-by: Jan Hoeppner Reviewed-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/cio/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index cd938a26b76c..3b1cd0c96a74 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1180,7 +1180,7 @@ static int io_subchannel_chp_event(struct subchannel *sch, else path_event[chpid] = PE_NONE; } - if (cdev) + if (cdev && cdev->drv && cdev->drv->path_event) cdev->drv->path_event(cdev, path_event); break; } From 6e7545ddb13416fd200e0b91c0acfd0404e2e27b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 3 Feb 2022 10:04:58 -0500 Subject: [PATCH 1239/1295] drm/amdgpu/display: change pipe policy for DCN 2.0 Fixes hangs on driver load with multiple displays on DCN 2.0 parts. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215511 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1877 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1886 Fixes: ee2698cf79cc ("drm/amd/display: Changed pipe split policy to allow for multi-display pipe split") Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index 2bc93df023ad..2a72517e2b28 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -1069,7 +1069,7 @@ static const struct dc_debug_options debug_defaults_drv = { .timing_trace = false, .clock_trace = true, .disable_pplib_clock_request = true, - .pipe_split_policy = MPC_SPLIT_DYNAMIC, + .pipe_split_policy = MPC_SPLIT_AVOID_MULT_DISP, .force_single_disp_pipe_split = false, .disable_dcc = DCC_ENABLE, .vsr_support = true, From d23a0c3718222a42430fd56359478a6fc7675070 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Feb 2022 15:26:18 +0900 Subject: [PATCH 1240/1295] kconfig: fix missing fclose() on error paths The file is not closed when ferror() fails. Fixes: 00d674cb3536 ("kconfig: refactor conf_write_dep()") Fixes: 57ddd07c4560 ("kconfig: refactor conf_write_autoconf()") Reported-by: Ryan Cai Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 59717be31210..16897cb8cefd 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -979,10 +979,10 @@ static int conf_write_autoconf_cmd(const char *autoconf_name) fprintf(out, "\n$(deps_config): ;\n"); - if (ferror(out)) /* error check for all fprintf() calls */ - return -1; - + ret = ferror(out); /* error check for all fprintf() calls */ fclose(out); + if (ret) + return -1; if (rename(tmp, name)) { perror("rename"); @@ -1093,10 +1093,10 @@ static int __conf_write_autoconf(const char *filename, print_symbol(file, sym); /* check possible errors in conf_write_heading() and print_symbol() */ - if (ferror(file)) - return -1; - + ret = ferror(file); fclose(file); + if (ret) + return -1; if (rename(tmp, filename)) { perror("rename"); From 8ecbb179286cbc91810c16caeb3396e06305cd0c Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Wed, 9 Feb 2022 10:47:17 +0800 Subject: [PATCH 1241/1295] net: usb: qmi_wwan: Add support for Dell DW5829e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dell DW5829e same as DW5821e except the CAT level. DW5821e supports CAT16 but DW5829e supports CAT9. Also, DW5829e includes normal and eSIM type. Please see below test evidence: T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 5 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=413c ProdID=81e6 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 7 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=413c ProdID=81e4 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e-eSIM Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option Signed-off-by: Slark Xiao Acked-by: Bjørn Mork Link: https://lore.kernel.org/r/20220209024717.8564-1-slark_xiao@163.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 37e5f3495362..3353e761016d 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1400,6 +1400,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 1)}, /* Dell Wireless 5821e preproduction config */ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ + {QMI_FIXED_INTF(0x413c, 0x81e4, 0)}, /* Dell Wireless 5829e with eSIM support*/ + {QMI_FIXED_INTF(0x413c, 0x81e6, 0)}, /* Dell Wireless 5829e */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ From 857898eb4b28daf3faca3ae334c78b2bb141475e Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 9 Feb 2022 17:25:07 -0800 Subject: [PATCH 1242/1295] selftests: mptcp: add missing join check This function also writes the name of the test with its ID, making clear a new test has been executed. Without that, the ADD_ADDR results from this test was appended at the end of the previous test causing confusions. Especially when the second test was failing, we had: 17 signal invalid addresses syn[ ok ] - synack[ ok ] - ack[ ok ] add[ ok ] - echo [ ok ] add[fail] got 2 ADD_ADDR[s] expected 3 In fact, this 17th test was OK but not the 18th one. Now we have: 17 signal invalid addresses syn[ ok ] - synack[ ok ] - ack[ ok ] add[ ok ] - echo [ ok ] 18 signal addresses race test syn[fail] got 2 JOIN[s] syn expected 3 - synack[fail] got 2 JOIN[s] synack expected - ack[fail] got 2 JOIN[s] ack expected 3 add[fail] got 2 ADD_ADDR[s] expected 3 Fixes: 33c563ad28e3 ("selftests: mptcp: add_addr and echo race test") Reported-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b8bdbec0cf69..c0801df15f54 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1159,6 +1159,7 @@ signal_address_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal addresses race test" 3 3 3 # the server will not signal the address terminating # the MPC subflow From 029744cd4bc6e9eb3bd833b4a033348296d34645 Mon Sep 17 00:00:00 2001 From: Kishen Maloor Date: Wed, 9 Feb 2022 17:25:08 -0800 Subject: [PATCH 1243/1295] mptcp: netlink: process IPv6 addrs in creating listening sockets This change updates mptcp_pm_nl_create_listen_socket() to create listening sockets bound to IPv6 addresses (where IPv6 is supported). Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Acked-by: Geliang Tang Signed-off-by: Kishen Maloor Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 782b1d452269..356f596e2032 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -925,6 +925,7 @@ out: static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { + int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct mptcp_sock *msk; struct socket *ssock; @@ -949,8 +950,11 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, } mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); - err = kernel_bind(ssock, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in)); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); if (err) { pr_warn("kernel_bind error, err=%d", err); goto out; From 5e5eddd94c8906472e3904cb11b4e3ccbb99d0d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Feb 2022 09:57:48 +0100 Subject: [PATCH 1244/1295] Revert "arm64: dts: imx8mn-venice-gw7902: disable gpu" This reverts commit 0c566618e27f17b5807086dba8c222ca8ca3dc1e, this one was meant for v5.18, not as a bugfix, though the patch itself was correct. Reported-by: Shawn Guo Signed-off-by: Arnd Bergmann --- .../boot/dts/freescale/imx8mn-venice-gw7902.dts | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts index 2d58005d20e4..236f425e1570 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts @@ -220,10 +220,6 @@ }; }; -&disp_blk_ctrl { - status = "disabled"; -}; - /* off-board header */ &ecspi2 { pinctrl-names = "default"; @@ -255,10 +251,6 @@ }; }; -&gpu { - status = "disabled"; -}; - &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; @@ -554,10 +546,6 @@ status = "okay"; }; -&pgc_gpumix { - status = "disabled"; -}; - /* off-board header */ &sai3 { pinctrl-names = "default"; From f34c4f2dd2445ab89e5373fff2990fab36578bd3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 7 Feb 2022 08:41:03 +0100 Subject: [PATCH 1245/1295] xen/x86: obtain full video frame buffer address for Dom0 also under EFI The initial change would not work when Xen was booted from EFI: There is an early exit from the case block in that case. Move the necessary code ahead of that. Fixes: 335e4dd67b48 ("xen/x86: obtain upper 32 bits of video frame buffer address for Dom0") Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/2501ce9d-40e5-b49d-b0e5-435544d17d4a@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/vga.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 31b1e3477cb6..14ea32e734d5 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -57,6 +57,14 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.ext_lfb_base) + + sizeof(info->u.vesa_lfb.ext_lfb_base) + && info->u.vesa_lfb.ext_lfb_base) { + screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base; + screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + } + if (info->video_type == XEN_VGATYPE_EFI_LFB) { screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; break; @@ -66,14 +74,6 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) u.vesa_lfb.mode_attrs) + sizeof(info->u.vesa_lfb.mode_attrs)) screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; - - if (size >= offsetof(struct dom0_vga_console_info, - u.vesa_lfb.ext_lfb_base) - + sizeof(info->u.vesa_lfb.ext_lfb_base) - && info->u.vesa_lfb.ext_lfb_base) { - screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base; - screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - } break; } } From e07e98da924e61e814bdaaa3ebc6e72b60dbf9ed Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Thu, 20 Jan 2022 16:25:27 +0100 Subject: [PATCH 1246/1295] xen/x86: detect support for extended destination ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xen allows the usage of some previously reserved bits in the IO-APIC RTE and the MSI address fields in order to store high bits for the target APIC ID. Such feature is already implemented by QEMU/KVM and HyperV, so in order to enable it just add the handler that checks for it's presence. Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220120152527.7524-3-roger.pau@citrix.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/cpuid.h | 7 +++++++ arch/x86/xen/enlighten_hvm.c | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index a9630104f1c4..78e667a31d6c 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -100,6 +100,13 @@ /* Memory mapped from other domains has valid IOMMU entries */ #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) #define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ +#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */ +/* + * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be + * used to store high bits for the Destination ID. This expands the Destination + * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768. + */ +#define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5) /* * Leaf 6 (0x40000x05) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 6448c5071117..945363d9cdba 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -248,6 +248,11 @@ static __init bool xen_x2apic_available(void) return x2apic_supported(); } +static bool __init msi_ext_dest_id(void) +{ + return cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_EXT_DEST_ID; +} + static __init void xen_hvm_guest_late_init(void) { #ifdef CONFIG_XEN_PVH @@ -310,6 +315,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .init.x2apic_available = xen_x2apic_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, + .init.msi_ext_dest_id = msi_ext_dest_id, .runtime.pin_vcpu = xen_pin_vcpu, .ignore_nopv = true, }; From afea27dc3105004080c3127c6570dc3dff8563b2 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 7 Feb 2022 18:35:06 +0800 Subject: [PATCH 1247/1295] xen/x2apic: Fix inconsistent indenting Eliminate the follow smatch warning: arch/x86/xen/enlighten_hvm.c:189 xen_cpu_dead_hvm() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220207103506.102008-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_hvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 945363d9cdba..517a9d8d8f94 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -185,8 +185,7 @@ static int xen_cpu_dead_hvm(unsigned int cpu) if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) xen_teardown_timer(cpu); - - return 0; + return 0; } static bool no_vector_callback __initdata; From f66edf684edcb85c1db0b0aa8cf1a9392ba68a9d Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Wed, 9 Feb 2022 11:28:41 +0800 Subject: [PATCH 1248/1295] xen/pci: Make use of the helper macro LIST_HEAD() Replace "struct list_head head = LIST_HEAD_INIT(head)" with "LIST_HEAD(head)" to simplify the code. Signed-off-by: Cai Huoqing Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220209032842.38818-1-cai.huoqing@linux.dev Signed-off-by: Juergen Gross --- drivers/xen/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 2c890f4f2cbc..72d4e3f193af 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -264,7 +264,7 @@ struct xen_device_domain_owner { }; static DEFINE_SPINLOCK(dev_domain_list_spinlock); -static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); +static LIST_HEAD(dev_domain_list); static struct xen_device_domain_owner *find_device(struct pci_dev *dev) { From 9aa422ad326634b76309e8ff342c246800621216 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Sat, 5 Feb 2022 14:11:18 -0500 Subject: [PATCH 1249/1295] tipc: improve size validations for received domain records The function tipc_mon_rcv() allows a node to receive and process domain_record structs from peer nodes to track their views of the network topology. This patch verifies that the number of members in a received domain record does not exceed the limit defined by MAX_MON_DOMAIN, something that may otherwise lead to a stack overflow. tipc_mon_rcv() is called from the function tipc_link_proto_rcv(), where we are reading a 32 bit message data length field into a uint16. To avert any risk of bit overflow, we add an extra sanity check for this in that function. We cannot see that happen with the current code, but future designers being unaware of this risk, may introduce it by allowing delivery of very large (> 64k) sk buffers from the bearer layer. This potential problem was identified by Eric Dumazet. This fixes CVE-2022-0435 Reported-by: Samuel Page Reported-by: Eric Dumazet Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Signed-off-by: Jon Maloy Reviewed-by: Xin Long Reviewed-by: Samuel Page Reviewed-by: Eric Dumazet Signed-off-by: Linus Torvalds --- net/tipc/link.c | 9 +++++++-- net/tipc/monitor.c | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 8d9e09f48f4c..1e14d7f8f28f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2200,7 +2200,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; bool reply = msg_probe(hdr), retransmitted = false; - u16 dlen = msg_data_sz(hdr), glen = 0; + u32 dlen = msg_data_sz(hdr), glen = 0; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); @@ -2214,6 +2214,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, void *data; trace_tipc_proto_rcv(skb, false, l->name); + + if (dlen > U16_MAX) + goto exit; + if (tipc_link_is_blocked(l) || !xmitq) goto exit; @@ -2309,7 +2313,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Receive Gap ACK blocks from peer if any */ glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); - + if(glen > dlen) + break; tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 407619697292..2f4d23238a7e 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -496,6 +496,8 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, state->probing = false; /* Sanity check received domain record */ + if (new_member_cnt > MAX_MON_DOMAIN) + return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) From 37f7860602b5b2d99fc7465f6407f403f5941988 Mon Sep 17 00:00:00 2001 From: Marc St-Amand Date: Wed, 9 Feb 2022 15:13:25 +0530 Subject: [PATCH 1250/1295] net: macb: Align the dma and coherent dma masks Single page and coherent memory blocks can use different DMA masks when the macb accesses physical memory directly. The kernel is clever enough to allocate pages that fit into the requested address width. When using the ARM SMMU, the DMA mask must be the same for single pages and big coherent memory blocks. Otherwise the translation tables turn into one big mess. [ 74.959909] macb ff0e0000.ethernet eth0: DMA bus error: HRESP not OK [ 74.959989] arm-smmu fd800000.smmu: Unhandled context fault: fsr=0x402, iova=0x3165687460, fsynr=0x20001, cbfrsynra=0x877, cb=1 [ 75.173939] macb ff0e0000.ethernet eth0: DMA bus error: HRESP not OK [ 75.173955] arm-smmu fd800000.smmu: Unhandled context fault: fsr=0x402, iova=0x3165687460, fsynr=0x20001, cbfrsynra=0x877, cb=1 Since using the same DMA mask does not hurt direct 1:1 physical memory mappings, this commit always aligns DMA and coherent masks. Signed-off-by: Marc St-Amand Signed-off-by: Harini Katakam Acked-by: Nicolas Ferre Tested-by: Conor Dooley Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index a363da928e8b..98498a76ae16 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4712,7 +4712,7 @@ static int macb_probe(struct platform_device *pdev) #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT if (GEM_BFEXT(DAW64, gem_readl(bp, DCFG6))) { - dma_set_mask(&pdev->dev, DMA_BIT_MASK(44)); + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(44)); bp->hw_dma_cap |= HW_DMA_CAP_64B; } #endif From 58e61e416b5abedcacd32032144b333bca30cf1e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 9 Feb 2022 07:02:42 -0800 Subject: [PATCH 1251/1295] skbuff: cleanup double word in comment Remove the second 'to'. Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0118f0afaa4f..9d0388bed0c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -681,7 +681,7 @@ exit: * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have - * additional references to to the fragmented part of the SKB. + * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ From 9ccc6e0c8959a019bb40f6b18704b142c04b19a8 Mon Sep 17 00:00:00 2001 From: Robert-Ionut Alexa Date: Wed, 9 Feb 2022 17:57:43 +0200 Subject: [PATCH 1252/1295] dpaa2-eth: unregister the netdev before disconnecting from the PHY The netdev should be unregistered before we are disconnecting from the MAC/PHY so that the dev_close callback is called and the PHY and the phylink workqueues are actually stopped before we are disconnecting and destroying the phylink instance. Fixes: 719479230893 ("dpaa2-eth: add MAC/PHY support through phylink") Signed-off-by: Robert-Ionut Alexa Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index e985ae008a97..dd9385d15f6b 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4523,12 +4523,12 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev) #ifdef CONFIG_DEBUG_FS dpaa2_dbg_remove(priv); #endif + + unregister_netdev(net_dev); rtnl_lock(); dpaa2_eth_disconnect_mac(priv); rtnl_unlock(); - unregister_netdev(net_dev); - dpaa2_eth_dl_port_del(priv); dpaa2_eth_dl_traps_unregister(priv); dpaa2_eth_dl_free(priv); From c4416f5c2eb3ed48dfba265e628a6e52da962f03 Mon Sep 17 00:00:00 2001 From: Victor Erminpour Date: Wed, 9 Feb 2022 16:28:38 -0800 Subject: [PATCH 1253/1295] net: mpls: Fix GCC 12 warning When building with automatic stack variable initialization, GCC 12 complains about variables defined outside of switch case statements. Move the variable outside the switch, which silences the warning: ./net/mpls/af_mpls.c:1624:21: error: statement will never be executed [-Werror=switch-unreachable] 1624 | int err; | ^~~ Signed-off-by: Victor Erminpour Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 48f75a56f4ae..d6fdc5782d33 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1607,6 +1607,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpls_dev *mdev; unsigned int flags; + int err; if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); @@ -1621,7 +1622,6 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, return NOTIFY_OK; switch (event) { - int err; case NETDEV_DOWN: err = mpls_ifdown(dev, event); From 21338d58736ef70eaae5fd75d567a358ff7902f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 7 Jan 2022 11:02:06 +0300 Subject: [PATCH 1254/1295] ice: fix an error code in ice_cfg_phy_fec() Propagate the error code from ice_get_link_default_override() instead of returning success. Fixes: ea78ce4dab05 ("ice: add link lenient and default override support") Signed-off-by: Dan Carpenter Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 408d15a5b0e3..a6d7d3eff186 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -3342,7 +3342,8 @@ ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, !ice_fw_supports_report_dflt_cfg(hw)) { struct ice_link_default_override_tlv tlv; - if (ice_get_link_default_override(&tlv, pi)) + status = ice_get_link_default_override(&tlv, pi); + if (status) goto out; if (!(tlv.options & ICE_LINK_OVERRIDE_STRICT_MODE) && From 46b699c50c0304cdbd725d7740073a7f9d5edb10 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 14 Jan 2022 15:38:39 -0800 Subject: [PATCH 1255/1295] ice: fix IPIP and SIT TSO offload The driver was avoiding offload for IPIP (at least) frames due to parsing the inner header offsets incorrectly when trying to check lengths. This length check works for VXLAN frames but fails on IPIP frames because skb_transport_offset points to the inner header in IPIP frames, which meant the subtraction of transport_header from inner_network_header returns a negative value (-20). With the code before this patch, everything continued to work, but GSO was being used to segment, causing throughputs of 1.5Gb/s per thread. After this patch, throughput is more like 10Gb/s per thread for IPIP traffic. Fixes: e94d44786693 ("ice: Implement filter sync, NDO operations and bump version") Signed-off-by: Jesse Brandeburg Reviewed-by: Paul Menzel Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_lan_tx_rx.h | 1 + drivers/net/ethernet/intel/ice/ice_main.c | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index d981dc6f2323..85a612838a89 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -568,6 +568,7 @@ struct ice_tx_ctx_desc { (0x3FFFFULL << ICE_TXD_CTX_QW1_TSO_LEN_S) #define ICE_TXD_CTX_QW1_MSS_S 50 +#define ICE_TXD_CTX_MIN_MSS 64 #define ICE_TXD_CTX_QW1_VSI_S 50 #define ICE_TXD_CTX_QW1_VSI_M (0x3FFULL << ICE_TXD_CTX_QW1_VSI_S) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 30814435f779..3b751d8b4056 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8525,6 +8525,7 @@ ice_features_check(struct sk_buff *skb, struct net_device __always_unused *netdev, netdev_features_t features) { + bool gso = skb_is_gso(skb); size_t len; /* No point in doing any of this if neither checksum nor GSO are @@ -8537,24 +8538,32 @@ ice_features_check(struct sk_buff *skb, /* We cannot support GSO if the MSS is going to be less than * 64 bytes. If it is then we need to drop support for GSO. */ - if (skb_is_gso(skb) && (skb_shinfo(skb)->gso_size < 64)) + if (gso && (skb_shinfo(skb)->gso_size < ICE_TXD_CTX_MIN_MSS)) features &= ~NETIF_F_GSO_MASK; - len = skb_network_header(skb) - skb->data; + len = skb_network_offset(skb); if (len > ICE_TXD_MACLEN_MAX || len & 0x1) goto out_rm_features; - len = skb_transport_header(skb) - skb_network_header(skb); + len = skb_network_header_len(skb); if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; if (skb->encapsulation) { - len = skb_inner_network_header(skb) - skb_transport_header(skb); - if (len > ICE_TXD_L4LEN_MAX || len & 0x1) - goto out_rm_features; + /* this must work for VXLAN frames AND IPIP/SIT frames, and in + * the case of IPIP frames, the transport header pointer is + * after the inner header! So check to make sure that this + * is a GRE or UDP_TUNNEL frame before doing that math. + */ + if (gso && (skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_UDP_TUNNEL))) { + len = skb_inner_network_header(skb) - + skb_transport_header(skb); + if (len > ICE_TXD_L4LEN_MAX || len & 0x1) + goto out_rm_features; + } - len = skb_inner_transport_header(skb) - - skb_inner_network_header(skb); + len = skb_inner_network_header_len(skb); if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; } From bea1898f65b9b7096cb4e73e97c83b94718f1fa1 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 18 Jan 2022 13:08:20 -0800 Subject: [PATCH 1256/1295] ice: Fix KASAN error in LAG NETDEV_UNREGISTER handler Currently, the same handler is called for both a NETDEV_BONDING_INFO LAG unlink notification as for a NETDEV_UNREGISTER call. This is causing a problem though, since the netdev_notifier_info passed has a different structure depending on which event is passed. The problem manifests as a call trace from a BUG: KASAN stack-out-of-bounds error. Fix this by creating a handler specific to NETDEV_UNREGISTER that only is passed valid elements in the netdev_notifier_info struct for the NETDEV_UNREGISTER event. Also included is the removal of an unbalanced dev_put on the peer_netdev and related braces. Fixes: 6a8b357278f5 ("ice: Respond to a NETDEV_UNREGISTER event for LAG") Signed-off-by: Dave Ertman Acked-by: Jonathan Toppins Tested-by: Sunitha Mekala Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lag.c | 34 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index e375ac849aec..4f954db01b92 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -204,17 +204,39 @@ ice_lag_unlink(struct ice_lag *lag, lag->upper_netdev = NULL; } - if (lag->peer_netdev) { - dev_put(lag->peer_netdev); - lag->peer_netdev = NULL; - } - + lag->peer_netdev = NULL; ice_set_sriov_cap(pf); ice_set_rdma_cap(pf); lag->bonded = false; lag->role = ICE_LAG_NONE; } +/** + * ice_lag_unregister - handle netdev unregister events + * @lag: LAG info struct + * @netdev: netdev reporting the event + */ +static void ice_lag_unregister(struct ice_lag *lag, struct net_device *netdev) +{ + struct ice_pf *pf = lag->pf; + + /* check to see if this event is for this netdev + * check that we are in an aggregate + */ + if (netdev != lag->netdev || !lag->bonded) + return; + + if (lag->upper_netdev) { + dev_put(lag->upper_netdev); + lag->upper_netdev = NULL; + ice_set_sriov_cap(pf); + ice_set_rdma_cap(pf); + } + /* perform some cleanup in case we come back */ + lag->bonded = false; + lag->role = ICE_LAG_NONE; +} + /** * ice_lag_changeupper_event - handle LAG changeupper event * @lag: LAG info struct @@ -307,7 +329,7 @@ ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event, ice_lag_info_event(lag, ptr); break; case NETDEV_UNREGISTER: - ice_lag_unlink(lag, ptr); + ice_lag_unregister(lag, netdev); break; default: break; From 5dbbbd01cbba831233c6ea9a3e6bfa133606d3c0 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Thu, 20 Jan 2022 16:27:56 -0800 Subject: [PATCH 1257/1295] ice: Avoid RTNL lock when re-creating auxiliary device If a call to re-create the auxiliary device happens in a context that has already taken the RTNL lock, then the call flow that recreates auxiliary device can hang if there is another attempt to claim the RTNL lock by the auxiliary driver. To avoid this, any call to re-create auxiliary devices that comes from an source that is holding the RTNL lock (e.g. netdev notifier when interface exits a bond) should execute in a separate thread. To accomplish this, add a flag to the PF that will be evaluated in the service task and dealt with there. Fixes: f9f5301e7e2d ("ice: Register auxiliary device to provide RDMA") Signed-off-by: Dave Ertman Reviewed-by: Jonathan Toppins Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 3 ++- drivers/net/ethernet/intel/ice/ice_main.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 4e16d185077d..a9fa701aaa95 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -483,6 +483,7 @@ enum ice_pf_flags { ICE_FLAG_VF_TRUE_PROMISC_ENA, ICE_FLAG_MDD_AUTO_RESET_VF, ICE_FLAG_LINK_LENIENT_MODE_ENA, + ICE_FLAG_PLUG_AUX_DEV, ICE_PF_FLAGS_NBITS /* must be last */ }; @@ -887,7 +888,7 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf) if (pf->hw.func_caps.common_cap.rdma && pf->num_rdma_msix) { set_bit(ICE_FLAG_RDMA_ENA, pf->flags); set_bit(ICE_FLAG_AUX_ENA, pf->flags); - ice_plug_aux_dev(pf); + set_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags); } } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 3b751d8b4056..17a9bb461dc3 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2253,6 +2253,9 @@ static void ice_service_task(struct work_struct *work) return; } + if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_plug_aux_dev(pf); + ice_clean_adminq_subtask(pf); ice_check_media_subtask(pf); ice_check_for_hang_subtask(pf); From f40fe31c01445f31253b15bef2412b33ae31093b Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Sun, 23 Jan 2022 20:13:52 +0800 Subject: [PATCH 1258/1295] riscv: cpu-hotplug: clear cpu from numa map when teardown There is numa_add_cpu() when cpus online, accordingly, there should be numa_remove_cpu() when cpus offline. Signed-off-by: Pingfan Liu Fixes: 4f0e8eef772e ("riscv: Add numa support for riscv64 platform") Cc: stable@vger.kernel.org [Palmer: Add missing NUMA include] Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu-hotplug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c index be7f05b542bb..f7a832e3a1d1 100644 --- a/arch/riscv/kernel/cpu-hotplug.c +++ b/arch/riscv/kernel/cpu-hotplug.c @@ -12,6 +12,7 @@ #include #include #include +#include #include bool cpu_has_hotplug(unsigned int cpu) @@ -40,6 +41,7 @@ int __cpu_disable(void) return ret; remove_cpu_topology(cpu); + numa_remove_cpu(cpu); set_cpu_online(cpu, false); irq_migrate_all_off_this_cpu(); From 6df2a016c0c8a3d0933ef33dd192ea6606b115e3 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Wed, 26 Jan 2022 18:14:42 +0100 Subject: [PATCH 1259/1295] riscv: fix build with binutils 2.38 From version 2.38, binutils default to ISA spec version 20191213. This means that the csr read/write (csrr*/csrw*) instructions and fence.i instruction has separated from the `I` extension, become two standalone extensions: Zicsr and Zifencei. As the kernel uses those instruction, this causes the following build failure: CC arch/riscv/kernel/vdso/vgettimeofday.o <>/arch/riscv/include/asm/vdso/gettimeofday.h: Assembler messages: <>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01' <>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01' <>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01' <>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01' The fix is to specify those extensions explicitely in -march. However as older binutils version do not support this, we first need to detect that. Signed-off-by: Aurelien Jarno Tested-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 8a107ed18b0d..7d81102cffd4 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -50,6 +50,12 @@ riscv-march-$(CONFIG_ARCH_RV32I) := rv32ima riscv-march-$(CONFIG_ARCH_RV64I) := rv64ima riscv-march-$(CONFIG_FPU) := $(riscv-march-y)fd riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c + +# Newer binutils versions default to ISA spec version 20191213 which moves some +# instructions from the I extension to the Zicsr and Zifencei extensions. +toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zicsr_zifencei) +riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei + KBUILD_CFLAGS += -march=$(subst fd,,$(riscv-march-y)) KBUILD_AFLAGS += -march=$(riscv-march-y) From 7fbf6795d127a3b1bb39b0e42579904cf6db1624 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Thu, 10 Feb 2022 07:04:51 -0800 Subject: [PATCH 1260/1295] net: mscc: ocelot: fix mutex lock error during ethtool stats read An ongoing workqueue populates the stats buffer. At the same time, a user might query the statistics. While writing to the buffer is mutex-locked, reading from the buffer wasn't. This could lead to buggy reads by ethtool. This patch fixes the former blamed commit, but the bug was introduced in the latter. Signed-off-by: Colin Foster Fixes: 1e1caa9735f90 ("ocelot: Clean up stats update deferred work") Fixes: a556c76adc052 ("net: mscc: Add initial Ocelot switch support") Reported-by: Vladimir Oltean Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/all/20220210150451.416845-2-colin.foster@in-advantage.com/ Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 354e4474bcc3..e6de86552df0 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1745,12 +1745,11 @@ void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data) } EXPORT_SYMBOL(ocelot_get_strings); +/* Caller must hold &ocelot->stats_lock */ static void ocelot_update_stats(struct ocelot *ocelot) { int i, j; - mutex_lock(&ocelot->stats_lock); - for (i = 0; i < ocelot->num_phys_ports; i++) { /* Configure the port to read the stats from */ ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(i), SYS_STAT_CFG); @@ -1769,8 +1768,6 @@ static void ocelot_update_stats(struct ocelot *ocelot) ~(u64)U32_MAX) + val; } } - - mutex_unlock(&ocelot->stats_lock); } static void ocelot_check_stats_work(struct work_struct *work) @@ -1779,7 +1776,9 @@ static void ocelot_check_stats_work(struct work_struct *work) struct ocelot *ocelot = container_of(del_work, struct ocelot, stats_work); + mutex_lock(&ocelot->stats_lock); ocelot_update_stats(ocelot); + mutex_unlock(&ocelot->stats_lock); queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work, OCELOT_STATS_CHECK_DELAY); @@ -1789,12 +1788,16 @@ void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data) { int i; + mutex_lock(&ocelot->stats_lock); + /* check and update now */ ocelot_update_stats(ocelot); /* Copy all counters */ for (i = 0; i < ocelot->num_stats; i++) *data++ = ocelot->stats[port * ocelot->num_stats + i]; + + mutex_unlock(&ocelot->stats_lock); } EXPORT_SYMBOL(ocelot_get_ethtool_stats); From 51a04ebf21122d5c76a716ecd9bfc33ea44b2b39 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Feb 2022 19:40:17 +0200 Subject: [PATCH 1261/1295] net: dsa: mv88e6xxx: fix use-after-free in mv88e6xxx_mdios_unregister Since struct mv88e6xxx_mdio_bus *mdio_bus is the bus->priv of something allocated with mdiobus_alloc_size(), this means that mdiobus_free(bus) will free the memory backing the mdio_bus as well. Therefore, the mdio_bus->list element is freed memory, but we continue to iterate through the list of MDIO buses using that list element. To fix this, use the proper list iterator that handles element deletion by keeping a copy of the list element next pointer. Fixes: f53a2ce893b2 ("net: dsa: mv88e6xxx: don't use devres for mdiobus") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220210174017.3271099-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 659f29582406..8530dbe403f4 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3449,10 +3449,10 @@ out: static void mv88e6xxx_mdios_unregister(struct mv88e6xxx_chip *chip) { - struct mv88e6xxx_mdio_bus *mdio_bus; + struct mv88e6xxx_mdio_bus *mdio_bus, *p; struct mii_bus *bus; - list_for_each_entry(mdio_bus, &chip->mdios, list) { + list_for_each_entry_safe(mdio_bus, p, &chip->mdios, list) { bus = mdio_bus->bus; if (!mdio_bus->external) From 8795359e35bc33bf86b6d0765aa7f37431db3b9c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 8 Feb 2022 10:48:07 -0800 Subject: [PATCH 1262/1295] x86/sgx: Silence softlockup detection when releasing large enclaves Vijay reported that the "unclobbered_vdso_oversubscribed" selftest triggers the softlockup detector. Actual SGX systems have 128GB of enclave memory or more. The "unclobbered_vdso_oversubscribed" selftest creates one enclave which consumes all of the enclave memory on the system. Tearing down such a large enclave takes around a minute, most of it in the loop where the EREMOVE instruction is applied to each individual 4k enclave page. Spending one minute in a loop triggers the softlockup detector. Add a cond_resched() to give other tasks a chance to run and placate the softlockup detector. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Vijay Dhanraj Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Dave Hansen Tested-by: Jarkko Sakkinen (kselftest as sanity check) Link: https://lkml.kernel.org/r/ced01cac1e75f900251b0a4ae1150aa8ebd295ec.1644345232.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 001808e3901c..48afe96ae0f0 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -410,6 +410,8 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); } xa_destroy(&encl->page_array); From 5c72263ef2fbe99596848f03758ae2dc593adf2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 8 Feb 2022 00:57:17 -0800 Subject: [PATCH 1263/1295] signal: HANDLER_EXIT should clear SIGNAL_UNKILLABLE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fatal SIGSYS signals (i.e. seccomp RET_KILL_* syscall filter actions) were not being delivered to ptraced pid namespace init processes. Make sure the SIGNAL_UNKILLABLE doesn't get set for these cases. Reported-by: Robert Święcki Suggested-by: "Eric W. Biederman" Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: "Eric W. Biederman" Link: https://lore.kernel.org/lkml/878rui8u4a.fsf@email.froward.int.ebiederm.org --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 38602738866e..9b04631acde8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1342,9 +1342,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect - * debugging to leave init killable. + * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ - if (action->sa.sa_handler == SIG_DFL && !t->ptrace) + if (action->sa.sa_handler == SIG_DFL && + (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); From 495ac3069a6235bfdf516812a2a9b256671bbdf9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 Feb 2022 20:21:13 -0800 Subject: [PATCH 1264/1295] seccomp: Invalidate seccomp mode to catch death failures If seccomp tries to kill a process, it should never see that process again. To enforce this proactively, switch the mode to something impossible. If encountered: WARN, reject all syscalls, and attempt to kill the process again even harder. Cc: Andy Lutomirski Cc: Will Drewry Fixes: 8112c4f140fa ("seccomp: remove 2-phase API") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4d8f44a17727..db10e73d06e0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,6 +29,9 @@ #include #include +/* Not exposed in headers: strictly internal use only. */ +#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -1010,6 +1013,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -1261,6 +1265,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || @@ -1309,6 +1314,11 @@ int __secure_computing(const struct seccomp_data *sd) return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); + /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ + case SECCOMP_MODE_DEAD: + WARN_ON_ONCE(1); + do_exit(SIGKILL); + return -1; default: BUG(); } From eed09ad261822a7bdc441ed192c6f444375e5527 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 Feb 2022 20:53:34 -0800 Subject: [PATCH 1265/1295] samples/seccomp: Adjust sample to also provide kill option As a quick way to test SECCOMP_RET_KILL, have a negative errno mean to kill the process. While we're in here, also swap the arch and syscall arguments so they're ordered more like how seccomp filters order them. Signed-off-by: Kees Cook --- samples/seccomp/dropper.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c index cc0648eb389e..4bca4b70f665 100644 --- a/samples/seccomp/dropper.c +++ b/samples/seccomp/dropper.c @@ -25,7 +25,7 @@ #include #include -static int install_filter(int nr, int arch, int error) +static int install_filter(int arch, int nr, int error) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, @@ -42,6 +42,10 @@ static int install_filter(int nr, int arch, int error) .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; + if (error == -1) { + struct sock_filter kill = BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL); + filter[4] = kill; + } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { perror("prctl(NO_NEW_PRIVS)"); return 1; @@ -57,9 +61,10 @@ int main(int argc, char **argv) { if (argc < 5) { fprintf(stderr, "Usage:\n" - "dropper []\n" + "dropper []\n" "Hint: AUDIT_ARCH_I386: 0x%X\n" " AUDIT_ARCH_X86_64: 0x%X\n" + " errno == -1 means SECCOMP_RET_KILL\n" "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); return 1; } From 57bc3d3ae8c14df3ceb4e17d26ddf9eeab304581 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 26 Jan 2022 14:14:52 +0100 Subject: [PATCH 1266/1295] net: usb: ax88179_178a: Fix out-of-bounds accesses in RX fixup ax88179_rx_fixup() contains several out-of-bounds accesses that can be triggered by a malicious (or defective) USB device, in particular: - The metadata array (hdr_off..hdr_off+2*pkt_cnt) can be out of bounds, causing OOB reads and (on big-endian systems) OOB endianness flips. - A packet can overlap the metadata array, causing a later OOB endianness flip to corrupt data used by a cloned SKB that has already been handed off into the network stack. - A packet SKB can be constructed whose tail is far beyond its end, causing out-of-bounds heap data to be considered part of the SKB's data. I have tested that this can be used by a malicious USB device to send a bogus ICMPv6 Echo Request and receive an ICMPv6 Echo Reply in response that contains random kernel heap data. It's probably also possible to get OOB writes from this on a little-endian system somehow - maybe by triggering skb_cow() via IP options processing -, but I haven't tested that. Fixes: e2ca90c276e1 ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Cc: stable@kernel.org Signed-off-by: Jann Horn Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/ax88179_178a.c | 66 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 1a627ba4b850..a31098981a65 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1468,58 +1468,68 @@ static int ax88179_rx_fixup(struct usbnet *dev, struct sk_buff *skb) u16 hdr_off; u32 *pkt_hdr; - /* This check is no longer done by usbnet */ - if (skb->len < dev->net->hard_header_len) + /* At the end of the SKB, there's a header telling us how many packets + * are bundled into this buffer and where we can find an array of + * per-packet metadata (which contains elements encoded into u16). + */ + if (skb->len < 4) return 0; - skb_trim(skb, skb->len - 4); rx_hdr = get_unaligned_le32(skb_tail_pointer(skb)); - pkt_cnt = (u16)rx_hdr; hdr_off = (u16)(rx_hdr >> 16); + + if (pkt_cnt == 0) + return 0; + + /* Make sure that the bounds of the metadata array are inside the SKB + * (and in front of the counter at the end). + */ + if (pkt_cnt * 2 + hdr_off > skb->len) + return 0; pkt_hdr = (u32 *)(skb->data + hdr_off); - while (pkt_cnt--) { + /* Packets must not overlap the metadata array */ + skb_trim(skb, hdr_off); + + for (; ; pkt_cnt--, pkt_hdr++) { u16 pkt_len; le32_to_cpus(pkt_hdr); pkt_len = (*pkt_hdr >> 16) & 0x1fff; + if (pkt_len > skb->len) + return 0; + /* Check CRC or runt packet */ - if ((*pkt_hdr & AX_RXHDR_CRC_ERR) || - (*pkt_hdr & AX_RXHDR_DROP_ERR)) { - skb_pull(skb, (pkt_len + 7) & 0xFFF8); - pkt_hdr++; - continue; - } + if (((*pkt_hdr & (AX_RXHDR_CRC_ERR | AX_RXHDR_DROP_ERR)) == 0) && + pkt_len >= 2 + ETH_HLEN) { + bool last = (pkt_cnt == 0); - if (pkt_cnt == 0) { - skb->len = pkt_len; - /* Skip IP alignment pseudo header */ - skb_pull(skb, 2); - skb_set_tail_pointer(skb, skb->len); - skb->truesize = pkt_len + sizeof(struct sk_buff); - ax88179_rx_checksum(skb, pkt_hdr); - return 1; - } - - ax_skb = skb_clone(skb, GFP_ATOMIC); - if (ax_skb) { + if (last) { + ax_skb = skb; + } else { + ax_skb = skb_clone(skb, GFP_ATOMIC); + if (!ax_skb) + return 0; + } ax_skb->len = pkt_len; /* Skip IP alignment pseudo header */ skb_pull(ax_skb, 2); skb_set_tail_pointer(ax_skb, ax_skb->len); ax_skb->truesize = pkt_len + sizeof(struct sk_buff); ax88179_rx_checksum(ax_skb, pkt_hdr); + + if (last) + return 1; + usbnet_skb_return(dev, ax_skb); - } else { - return 0; } - skb_pull(skb, (pkt_len + 7) & 0xFFF8); - pkt_hdr++; + /* Trim this packet away from the SKB */ + if (!skb_pull(skb, (pkt_len + 7) & 0xFFF8)) + return 0; } - return 1; } static struct sk_buff * From c853685d11c09da35cb49bbf8f0c001abdc0d0a9 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 9 Feb 2022 17:45:00 +0100 Subject: [PATCH 1267/1295] usb: core: Unregister device on component_add() failure Commit 8c67d06f3fd9 ("usb: Link the ports to the connectors they are attached to") creates a link to the USB Type-C connector for every new port that is added when possible. If component_add() fails, usb_hub_create_port_device() prints a warning but does not unregister the device and does not return errors to the callers. Syzbot reported a "WARNING in component_del()". Fix this issue in usb_hub_create_port_device by calling device_unregister() and returning the errors from component_add(). Fixes: 8c67d06f3fd9 ("usb: Link the ports to the connectors they are attached to") Reported-and-tested-by: syzbot+60df062e1c41940cae0f@syzkaller.appspotmail.com Reviewed-by: Heikki Krogerus Signed-off-by: Fabio M. De Francesco Link: https://lore.kernel.org/r/20220209164500.8769-1-fmdefrancesco@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/port.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index c2bbf97a79be..d5bc36ca5b1f 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -602,11 +602,14 @@ int usb_hub_create_port_device(struct usb_hub *hub, int port1) return retval; } - find_and_link_peer(hub, port1); - retval = component_add(&port_dev->dev, &connector_ops); - if (retval) + if (retval) { dev_warn(&port_dev->dev, "failed to add component\n"); + device_unregister(&port_dev->dev); + return retval; + } + + find_and_link_peer(hub, port1); /* * Enable runtime pm and hold a refernce that hub_configure() From 75e5b4849b81e19e9efe1654b30d7f3151c33c2c Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Mon, 24 Jan 2022 12:14:00 +0100 Subject: [PATCH 1268/1295] USB: gadget: validate interface OS descriptor requests Stall the control endpoint in case provided index exceeds array size of MAX_CONFIG_INTERFACES or when the retrieved function pointer is null. Signed-off-by: Szymon Heidrich Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 16f9e3423c9f..9315313108c9 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1988,6 +1988,9 @@ unknown: if (w_index != 0x5 || (w_value >> 8)) break; interface = w_value & 0xFF; + if (interface >= MAX_CONFIG_INTERFACES || + !os_desc_cfg->interface[interface]) + break; buf[6] = w_index; count = count_ext_prop(os_desc_cfg, interface); From 38ea1eac7d88072bbffb630e2b3db83ca649b826 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 9 Feb 2022 16:37:53 +0100 Subject: [PATCH 1269/1295] usb: gadget: rndis: check size of RNDIS_MSG_SET command Check the size of the RNDIS_MSG_SET command given to us before attempting to respond to an invalid message size. Reported-by: Szymon Heidrich Cc: stable@kernel.org Tested-by: Szymon Heidrich Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/rndis.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 431d5a7d737e..b7ccf1803656 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -637,14 +637,17 @@ static int rndis_set_response(struct rndis_params *params, rndis_set_cmplt_type *resp; rndis_resp_t *r; + BufLength = le32_to_cpu(buf->InformationBufferLength); + BufOffset = le32_to_cpu(buf->InformationBufferOffset); + if ((BufLength > RNDIS_MAX_TOTAL_SIZE) || + (BufOffset + 8 >= RNDIS_MAX_TOTAL_SIZE)) + return -EINVAL; + r = rndis_add_response(params, sizeof(rndis_set_cmplt_type)); if (!r) return -ENOMEM; resp = (rndis_set_cmplt_type *)r->buf; - BufLength = le32_to_cpu(buf->InformationBufferLength); - BufOffset = le32_to_cpu(buf->InformationBufferOffset); - #ifdef VERBOSE_DEBUG pr_debug("%s: Length: %d\n", __func__, BufLength); pr_debug("%s: Offset: %d\n", __func__, BufOffset); From 269cbcf7b72de6f0016806d4a0cec1d689b55a87 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Wed, 9 Feb 2022 17:15:53 +0100 Subject: [PATCH 1270/1295] usb: dwc2: drd: fix soft connect when gadget is unconfigured When the gadget driver hasn't been (yet) configured, and the cable is connected to a HOST, the SFTDISCON gets cleared unconditionally, so the HOST tries to enumerate it. At the host side, this can result in a stuck USB port or worse. When getting lucky, some dmesg can be observed at the host side: new high-speed USB device number ... device descriptor read/64, error -110 Fix it in drd, by checking the enabled flag before calling dwc2_hsotg_core_connect(). It will be called later, once configured, by the normal flow: - udc_bind_to_driver - usb_gadget_connect - dwc2_hsotg_pullup - dwc2_hsotg_core_connect Fixes: 17f934024e84 ("usb: dwc2: override PHY input signals with usb role switch support") Cc: stable@kernel.org Reviewed-by: Amelie Delaunay Acked-by: Minas Harutyunyan Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/1644423353-17859-1-git-send-email-fabrice.gasnier@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/drd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc2/drd.c b/drivers/usb/dwc2/drd.c index 1b39c4776369..9b6d44d90ad9 100644 --- a/drivers/usb/dwc2/drd.c +++ b/drivers/usb/dwc2/drd.c @@ -130,8 +130,10 @@ static int dwc2_drd_role_sw_set(struct usb_role_switch *sw, enum usb_role role) already = dwc2_ovr_avalid(hsotg, true); } else if (role == USB_ROLE_DEVICE) { already = dwc2_ovr_bvalid(hsotg, true); - /* This clear DCTL.SFTDISCON bit */ - dwc2_hsotg_core_connect(hsotg); + if (hsotg->enabled) { + /* This clear DCTL.SFTDISCON bit */ + dwc2_hsotg_core_connect(hsotg); + } } else { if (dwc2_is_device_mode(hsotg)) { if (!dwc2_ovr_bvalid(hsotg, false)) From c72ea20503610a4a7ba26c769357d31602769c01 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 7 Feb 2022 16:01:19 +0100 Subject: [PATCH 1271/1295] iio: buffer: Fix file related error handling in IIO_BUFFER_GET_FD_IOCTL If we fail to copy the just created file descriptor to userland, we try to clean up by putting back 'fd' and freeing 'ib'. The code uses put_unused_fd() for the former which is wrong, as the file descriptor was already published by fd_install() which gets called internally by anon_inode_getfd(). This makes the error handling code leaving a half cleaned up file descriptor table around and a partially destructed 'file' object, allowing userland to play use-after-free tricks on us, by abusing the still usable fd and making the code operate on a dangling 'file->private_data' pointer. Instead of leaving the kernel in a partially corrupted state, don't attempt to explicitly clean up and leave this to the process exit path that'll release any still valid fds, including the one created by the previous call to anon_inode_getfd(). Simply return -EFAULT to indicate the error. Fixes: f73f7f4da581 ("iio: buffer: add ioctl() to support opening extra buffers for IIO device") Cc: stable@kernel.org Cc: Jonathan Cameron Cc: Alexandru Ardelean Cc: Lars-Peter Clausen Cc: Nuno Sa Reported-by: Dan Carpenter Signed-off-by: Mathias Krause Reviewed-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-buffer.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 94eb9f6cf128..208b5193c621 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -1569,9 +1569,17 @@ static long iio_device_buffer_getfd(struct iio_dev *indio_dev, unsigned long arg } if (copy_to_user(ival, &fd, sizeof(fd))) { - put_unused_fd(fd); - ret = -EFAULT; - goto error_free_ib; + /* + * "Leak" the fd, as there's not much we can do about this + * anyway. 'fd' might have been closed already, as + * anon_inode_getfd() called fd_install() on it, which made + * it reachable by userland. + * + * Instead of allowing a malicious user to play tricks with + * us, rely on the process exit path to do any necessary + * cleanup, as in releasing the file, if still needed. + */ + return -EFAULT; } return 0; From bf23747ee05320903177809648002601cd140cdd Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 11 Feb 2022 16:15:54 +0900 Subject: [PATCH 1272/1295] loop: revert "make autoclear operation asynchronous" The kernel test robot is reporting that xfstest which does umount ext2 on xfs umount xfs sequence started failing, for commit 322c4293ecc58110 ("loop: make autoclear operation asynchronous") removed a guarantee that fput() of backing file is processed before lo_release() from close() returns to user mode. And syzbot is reporting that deferring destroy_workqueue() from __loop_clr_fd() to a WQ context did not help [1]. Revert that commit. Link: https://syzkaller.appspot.com/bug?extid=831661966588c802aae9 [1] Reported-by: kernel test robot Acked-by: Jan Kara Reviewed-by: Christoph Hellwig Reported-by: syzbot Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/20220211071554.3424-1-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Jens Axboe --- drivers/block/loop.c | 65 ++++++++++++++++++++------------------------ drivers/block/loop.h | 1 - 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 01cbbfc4e9e2..150012ffb387 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1082,7 +1082,7 @@ out_putf: return error; } -static void __loop_clr_fd(struct loop_device *lo) +static void __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; @@ -1144,6 +1144,8 @@ static void __loop_clr_fd(struct loop_device *lo) /* let user-space know about this change */ kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); @@ -1151,52 +1153,44 @@ static void __loop_clr_fd(struct loop_device *lo) if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; - mutex_lock(&lo->lo_disk->open_mutex); + /* + * open_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (!release) + mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - mutex_unlock(&lo->lo_disk->open_mutex); + if (!release) + mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ } + /* + * lo->lo_state is set to Lo_unbound here after above partscan has + * finished. There cannot be anybody else entering __loop_clr_fd() as + * Lo_rundown state protects us from all the other places trying to + * change the 'lo' device. + */ lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART; - - fput(filp); -} - -static void loop_rundown_completed(struct loop_device *lo) -{ mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); - module_put(THIS_MODULE); -} -static void loop_rundown_workfn(struct work_struct *work) -{ - struct loop_device *lo = container_of(work, struct loop_device, - rundown_work); - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __loop_clr_fd(lo); - kobject_put(&bdev->bd_device.kobj); - module_put(disk->fops->owner); - loop_rundown_completed(lo); -} - -static void loop_schedule_rundown(struct loop_device *lo) -{ - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __module_get(disk->fops->owner); - kobject_get(&bdev->bd_device.kobj); - INIT_WORK(&lo->rundown_work, loop_rundown_workfn); - queue_work(system_long_wq, &lo->rundown_work); + /* + * Need not hold lo_mutex to fput backing file. Calling fput holding + * lo_mutex triggers a circular lock dependency possibility warning as + * fput can take open_mutex which is usually taken before lo_mutex. + */ + fput(filp); } static int loop_clr_fd(struct loop_device *lo) @@ -1228,8 +1222,7 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); - __loop_clr_fd(lo); - loop_rundown_completed(lo); + __loop_clr_fd(lo, false); return 0; } @@ -1754,7 +1747,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - loop_schedule_rundown(lo); + __loop_clr_fd(lo, true); return; } else if (lo->lo_state == Lo_bound) { /* diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 918a7a2dc025..082d4b6bfc6a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -56,7 +56,6 @@ struct loop_device { struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; - struct work_struct rundown_work; }; struct loop_cmd { From 356b8103d4c495d5440e3e687db9026ec2b76043 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 3 Feb 2022 14:06:56 +0100 Subject: [PATCH 1273/1295] Revert "gfs2: check context in gfs2_glock_put" It turns out that the might_sleep() call that commit 660a6126f8c3 adds is triggering occasional data corruption in testing. We're not sure about the root cause yet, but since this commit was added as a debugging aid only, revert it for now. This reverts commit 660a6126f8c3208f6df8d552039cda078a8426d1. Fixes: 660a6126f8c3 ("gfs2: check context in gfs2_glock_put") Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b7ab8430333c..6b23399eaee0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -301,9 +301,6 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl) void gfs2_glock_put(struct gfs2_glock *gl) { - /* last put could call sleepable dlm api */ - might_sleep(); - if (lockref_put_or_lock(&gl->gl_lockref)) return; From d3add1a9519dcacd6e644ecac741c56cf18b67f5 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Jan 2022 09:30:18 -0500 Subject: [PATCH 1274/1295] gfs2: Fix gfs2_release for non-writers regression When a file is opened for writing, the vfs code (do_dentry_open) calls get_write_access for the inode, thus incrementing the inode's write count. That writer normally then creates a multi-block reservation for the inode (i_res) that can be re-used by other writers, which speeds up writes for applications that stupidly loop on open/write/close. When the writes are all done, the multi-block reservation should be deleted when the file is closed by the last "writer." Commit 0ec9b9ea4f83 broke that concept when it moved the call to gfs2_rs_delete before the check for FMODE_WRITE. Non-writers have no business removing the multi-block reservations of writers. In fact, if someone opens and closes the file for RO while a writer has a multi-block reservation, the RO closer will delete the reservation midway through the write, and this results in: kernel BUG at fs/gfs2/rgrp.c:677! (or thereabouts) which is: BUG_ON(rs->rs_requested); from function gfs2_rs_deltree. This patch moves the check back inside the check for FMODE_WRITE. Fixes: 0ec9b9ea4f83 ("gfs2: Check for active reservation in gfs2_release") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e718cfc19a7..8c39a8571b1f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -704,10 +704,11 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); - if (file->f_mode & FMODE_WRITE) + if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_delete(ip, &inode->i_writecount); gfs2_qa_put(ip); + } return 0; } From 075b7d363c675ef7fa03918881caeca3458e2a96 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 7 Feb 2022 16:33:30 -0600 Subject: [PATCH 1275/1295] Revert "PCI/portdrv: Do not setup up IRQs if there are no users" This reverts commit 0e8ae5a6ff5952253cd7cc0260df838ab4c21009. 0e8ae5a6ff59 ("PCI/portdrv: Do not setup up IRQs if there are no users") reduced usage of IRQs when we don't think we need them. But Joey, Sergiu, and David reported choppy GUI rendering, systems that became unresponsive every few seconds, incorrect values reported by cpufreq, and high IRQ 16 CPU usage. Joey bisected the issues to 0e8ae5a6ff59, so revert it until we figure out a better solution. Link: https://lore.kernel.org/r/20220210222717.GA658201@bhelgaas Link: https://bugzilla.kernel.org/show_bug.cgi?id=215533 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215546 Reported-by: Joey Corleone Reported-by: Sergiu Deitsch Reported-by: David Spencer Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v5.16+ Cc: Jan Kiszka --- drivers/pci/pcie/portdrv_core.c | 47 ++++++++++++--------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index bda630889f95..604feeb84ee4 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -166,6 +166,9 @@ static int pcie_init_service_irqs(struct pci_dev *dev, int *irqs, int mask) { int ret, i; + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + irqs[i] = -1; + /* * If we support PME but can't use MSI/MSI-X for it, we have to * fall back to INTx or other interrupts, e.g., a system shared @@ -314,10 +317,8 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq) */ int pcie_port_device_register(struct pci_dev *dev) { - int status, capabilities, irq_services, i, nr_service; - int irqs[PCIE_PORT_DEVICE_MAXSERVICES] = { - [0 ... PCIE_PORT_DEVICE_MAXSERVICES-1] = -1 - }; + int status, capabilities, i, nr_service; + int irqs[PCIE_PORT_DEVICE_MAXSERVICES]; /* Enable PCI Express port device */ status = pci_enable_device(dev); @@ -330,32 +331,18 @@ int pcie_port_device_register(struct pci_dev *dev) return 0; pci_set_master(dev); - - irq_services = 0; - if (IS_ENABLED(CONFIG_PCIE_PME)) - irq_services |= PCIE_PORT_SERVICE_PME; - if (IS_ENABLED(CONFIG_PCIEAER)) - irq_services |= PCIE_PORT_SERVICE_AER; - if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) - irq_services |= PCIE_PORT_SERVICE_HP; - if (IS_ENABLED(CONFIG_PCIE_DPC)) - irq_services |= PCIE_PORT_SERVICE_DPC; - irq_services &= capabilities; - - if (irq_services) { - /* - * Initialize service IRQs. Don't use service devices that - * require interrupts if there is no way to generate them. - * However, some drivers may have a polling mode (e.g. - * pciehp_poll_mode) that can be used in the absence of IRQs. - * Allow them to determine if that is to be used. - */ - status = pcie_init_service_irqs(dev, irqs, irq_services); - if (status) { - irq_services &= PCIE_PORT_SERVICE_HP; - if (!irq_services) - goto error_disable; - } + /* + * Initialize service irqs. Don't use service devices that + * require interrupts if there is no way to generate them. + * However, some drivers may have a polling mode (e.g. pciehp_poll_mode) + * that can be used in the absence of irqs. Allow them to determine + * if that is to be used. + */ + status = pcie_init_service_irqs(dev, irqs, capabilities); + if (status) { + capabilities &= PCIE_PORT_SERVICE_HP; + if (!capabilities) + goto error_disable; } /* Allocate child services if any */ From 925346c129da1171222a9cdb11fa2b734d9955da Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 11 Feb 2022 16:32:22 -0800 Subject: [PATCH 1276/1295] fs/binfmt_elf: fix PT_LOAD p_align values for loaders Rui Salvaterra reported that Aisleroit solitaire crashes with "Wrong __data_start/_end pair" assertion from libgc after update to v5.17-rc1. Bisection pointed to commit 9630f0d60fec ("fs/binfmt_elf: use PT_LOAD p_align values for static PIE") that fixed handling of static PIEs, but made the condition that guards load_bias calculation to exclude loader binaries. Restoring the check for presence of interpreter fixes the problem. Link: https://lkml.kernel.org/r/20220202121433.3697146-1-rppt@kernel.org Fixes: 9630f0d60fec ("fs/binfmt_elf: use PT_LOAD p_align values for static PIE") Signed-off-by: Mike Rapoport Reported-by: Rui Salvaterra Tested-by: Rui Salvaterra Cc: Alexander Viro Cc: Eric Biederman Cc: "H.J. Lu" Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 605017eb9349..9e11e6f13e83 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1117,7 +1117,7 @@ out_free_interp: * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (alignment > ELF_MIN_ALIGN) { + if (interpreter || alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); From 24d7275ce2791829953ed4e72f68277ceb2571c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 11 Feb 2022 16:32:26 -0800 Subject: [PATCH 1277/1295] fs/proc: task_mmu.c: don't read mapcount for migration entry The syzbot reported the below BUG: kernel BUG at include/linux/page-flags.h:785! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline] RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744 Call Trace: page_mapcount include/linux/mm.h:837 [inline] smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466 smaps_pte_entry fs/proc/task_mmu.c:538 [inline] smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379 walk_page_vma+0x277/0x350 mm/pagewalk.c:530 smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768 smap_gather_stats fs/proc/task_mmu.c:741 [inline] show_smap+0xc6/0x440 fs/proc/task_mmu.c:822 seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272 seq_read+0x3e0/0x5b0 fs/seq_file.c:162 vfs_read+0x1b5/0x600 fs/read_write.c:479 ksys_read+0x12d/0x250 fs/read_write.c:619 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The reproducer was trying to read /proc/$PID/smaps when calling MADV_FREE at the mean time. MADV_FREE may split THPs if it is called for partial THP. It may trigger the below race: CPU A CPU B ----- ----- smaps walk: MADV_FREE: page_mapcount() PageCompound() split_huge_page() page = compound_head(page) PageDoubleMap(page) When calling PageDoubleMap() this page is not a tail page of THP anymore so the BUG is triggered. This could be fixed by elevated refcount of the page before calling mapcount, but that would prevent it from counting migration entries, and it seems overkilling because the race just could happen when PMD is split so all PTE entries of tail pages are actually migration entries, and smaps_account() does treat migration entries as mapcount == 1 as Kirill pointed out. Add a new parameter for smaps_account() to tell this entry is migration entry then skip calling page_mapcount(). Don't skip getting mapcount for device private entries since they do track references with mapcount. Pagemap also has the similar issue although it was not reported. Fixed it as well. [shy828301@gmail.com: v4] Link: https://lkml.kernel.org/r/20220203182641.824731-1-shy828301@gmail.com [nathan@kernel.org: avoid unused variable warning in pagemap_pmd_range()] Link: https://lkml.kernel.org/r/20220207171049.1102239-1-nathan@kernel.org Link: https://lkml.kernel.org/r/20220120202805.3369-1-shy828301@gmail.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Yang Shi Signed-off-by: Nathan Chancellor Reported-by: syzbot+1f52b3a18d5633fa7f82@syzkaller.appspotmail.com Acked-by: David Hildenbrand Cc: "Kirill A. Shutemov" Cc: Jann Horn Cc: Matthew Wilcox Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 18f8c3acbb85..6e97ed775074 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * The page_mapcount() is called to get a snapshot of the mapcount. + * Without holding the page lock this snapshot can be slightly wrong as + * we cannot always read the mapcount atomically. It is not safe to + * call page_mapcount() even with PTL held if the page is not mapped, + * especially for migration entries. Treat regular migration entries + * as mapcount == 1. */ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else { smaps_pte_hole_lookup(addr, walk); return; @@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -557,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -564,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; + bool migration = false; if (pte_present(pte)) { if (pm->show_pfn) @@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; + migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) flags |= PM_FILE; - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1421,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool migration = false; + ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { u64 flags = 0, frame = 0; @@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); + migration = is_migration_entry(entry); page = pfn_swap_entry_to_page(entry); } #endif - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { From b485c6f1f9f54b81443efda5f3d8a5036ba2cd91 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Feb 2022 16:32:29 -0800 Subject: [PATCH 1278/1295] mm: vmscan: remove deadlock due to throttling failing to make progress A soft lockup bug in kcompactd was reported in a private bugzilla with the following visible in dmesg; watchdog: BUG: soft lockup - CPU#33 stuck for 26s! [kcompactd0:479] watchdog: BUG: soft lockup - CPU#33 stuck for 52s! [kcompactd0:479] watchdog: BUG: soft lockup - CPU#33 stuck for 78s! [kcompactd0:479] watchdog: BUG: soft lockup - CPU#33 stuck for 104s! [kcompactd0:479] The machine had 256G of RAM with no swap and an earlier failed allocation indicated that node 0 where kcompactd was run was potentially unreclaimable; Node 0 active_anon:29355112kB inactive_anon:2913528kB active_file:0kB inactive_file:0kB unevictable:64kB isolated(anon):0kB isolated(file):0kB mapped:8kB dirty:0kB writeback:0kB shmem:26780kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 23480320kB writeback_tmp:0kB kernel_stack:2272kB pagetables:24500kB all_unreclaimable? yes Vlastimil Babka investigated a crash dump and found that a task migrating pages was trying to drain PCP lists; PID: 52922 TASK: ffff969f820e5000 CPU: 19 COMMAND: "kworker/u128:3" Call Trace: __schedule schedule schedule_timeout wait_for_completion __flush_work __drain_all_pages __alloc_pages_slowpath.constprop.114 __alloc_pages alloc_migration_target migrate_pages migrate_to_node do_migrate_pages cpuset_migrate_mm_workfn process_one_work worker_thread kthread ret_from_fork This failure is specific to CONFIG_PREEMPT=n builds. The root of the problem is that kcompact0 is not rescheduling on a CPU while a task that has isolated a large number of the pages from the LRU is waiting on kcompact0 to reschedule so the pages can be released. While shrink_inactive_list() only loops once around too_many_isolated, reclaim can continue without rescheduling if sc->skipped_deactivate == 1 which could happen if there was no file LRU and the inactive anon list was not low. Link: https://lkml.kernel.org/r/20220203100326.GD3301@suse.de Fixes: d818fca1cac3 ("mm/vmscan: throttle reclaim and compaction when too may pages are isolated") Signed-off-by: Mel Gorman Debugged-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Hugh Dickins Cc: Michal Hocko Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 090bfb605ecf..59b14e0d696c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1066,8 +1066,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && - current->flags & (PF_IO_WORKER|PF_KTHREAD)) + current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + cond_resched(); return; + } /* * These figures are pulled out of thin air. From 0764db9b49c932b89ee4d9e3236dff4bb07b4a66 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 11 Feb 2022 16:32:32 -0800 Subject: [PATCH 1279/1295] mm: memcg: synchronize objcg lists with a dedicated spinlock Alexander reported a circular lock dependency revealed by the mmap1 ltp test: LOCKDEP_CIRCULAR (suite: ltp, case: mtest06 (mmap1)) WARNING: possible circular locking dependency detected 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Not tainted ------------------------------------------------------ mmap1/202299 is trying to acquire lock: 00000001892c0188 (css_set_lock){..-.}-{2:2}, at: obj_cgroup_release+0x4a/0xe0 but task is already holding lock: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sighand->siglock){-.-.}-{2:2}: __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 __lock_task_sighand+0x90/0x190 cgroup_freeze_task+0x2e/0x90 cgroup_migrate_execute+0x11c/0x608 cgroup_update_dfl_csses+0x246/0x270 cgroup_subtree_control_write+0x238/0x518 kernfs_fop_write_iter+0x13e/0x1e0 new_sync_write+0x100/0x190 vfs_write+0x22c/0x2d8 ksys_write+0x6c/0xf8 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #0 (css_set_lock){..-.}-{2:2}: check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sighand->siglock); lock(css_set_lock); lock(&sighand->siglock); lock(css_set_lock); *** DEADLOCK *** 2 locks held by mmap1/202299: #0: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 #1: 00000001892ad560 (rcu_read_lock){....}-{1:2}, at: percpu_ref_put_many.constprop.0+0x0/0x168 stack backtrace: CPU: 15 PID: 202299 Comm: mmap1 Not tainted 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: dump_stack_lvl+0x76/0x98 check_noncircular+0x136/0x158 check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 INFO: lockdep is turned off. In this example a slab allocation from __send_signal() caused a refilling and draining of a percpu objcg stock, resulted in a releasing of another non-related objcg. Objcg release path requires taking the css_set_lock, which is used to synchronize objcg lists. This can create a circular dependency with the sighandler lock, which is taken with the locked css_set_lock by the freezer code (to freeze a task). In general it seems that using css_set_lock to synchronize objcg lists makes any slab allocations and deallocation with the locked css_set_lock and any intervened locks risky. To fix the problem and make the code more robust let's stop using css_set_lock to synchronize objcg lists and use a new dedicated spinlock instead. Link: https://lkml.kernel.org/r/Yfm1IHmoGdyUR81T@carbon.dhcp.thefacebook.com Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Signed-off-by: Roman Gushchin Reported-by: Alexander Egorenkov Tested-by: Alexander Egorenkov Reviewed-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Shakeel Butt Reviewed-by: Jeremy Linton Tested-by: Jeremy Linton Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b72d75141e12..0abbd685703b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,7 +219,7 @@ struct obj_cgroup { struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { - struct list_head list; + struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; @@ -315,7 +315,8 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; - struct list_head objcg_list; /* list of inherited objcgs */ + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; #endif MEMCG_PADDING(_pad2_); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09d342c7cbd0..36e9f38c919d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -254,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM -extern spinlock_t css_set_lock; +static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { @@ -298,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); - spin_lock_irqsave(&css_set_lock, flags); + spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); - spin_unlock_irqrestore(&css_set_lock, flags); + spin_unlock_irqrestore(&objcg_lock, flags); percpu_ref_exit(ref); kfree_rcu(objcg, rcu); @@ -332,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - spin_lock_irq(&css_set_lock); + spin_lock_irq(&objcg_lock); /* 1) Ready to reparent active objcg. */ list_add(&objcg->list, &memcg->objcg_list); @@ -342,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); - spin_unlock_irq(&css_set_lock); + spin_unlock_irq(&objcg_lock); percpu_ref_kill(&objcg->refcnt); } From 8913c61001482378d4ed8cc577b17c1ba3e847e4 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 11 Feb 2022 16:32:35 -0800 Subject: [PATCH 1280/1295] kfence: make test case compatible with run time set sample interval The parameter kfence_sample_interval can be set via boot parameter and late shell command, which is convenient for automated tests and KFENCE parameter optimization. However, KFENCE test case just uses compile-time CONFIG_KFENCE_SAMPLE_INTERVAL, which will make KFENCE test case not run as users desired. Export kfence_sample_interval, so that KFENCE test case can use run-time-set sample interval. Link: https://lkml.kernel.org/r/20220207034432.185532-1-liupeng256@huawei.com Signed-off-by: Peng Liu Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Sumit Semwal Cc: Christian Knig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 2 ++ mm/kfence/core.c | 3 ++- mm/kfence/kfence_test.c | 8 ++++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c..f49e64222628 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -17,6 +17,8 @@ #include #include +extern unsigned long kfence_sample_interval; + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5ad40e3add45..13128fa13062 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -47,7 +47,8 @@ static bool kfence_enabled __read_mostly; -static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a22b1af85577..50dbb815a2a8 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); /* * Especially for non-preemption kernels, ensure the allocation-gate * timer can catch up: after @resched_after, every failed allocation * attempt yields, to ensure the allocation-gate timer is scheduled. */ - resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval); do { if (test_cache) alloc = kmem_cache_alloc(test_cache, gfp); @@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test) int i; /* Skip if we think it'd take too long. */ - KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); + KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); @@ -739,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test) * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); do { void *objects[100]; int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), From 736e8d89044c1c330967fb938fa766cd9e0d8af0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 12 Feb 2022 10:08:54 +0100 Subject: [PATCH 1281/1295] Revert "usb: dwc2: drd: fix soft connect when gadget is unconfigured" This reverts commit 269cbcf7b72de6f0016806d4a0cec1d689b55a87. It causes build errors as reported by the kernel test robot. Link: https://lore.kernel.org/r/202202112236.AwoOTtHO-lkp@intel.com Reported-by: kernel test robot Fixes: 269cbcf7b72d ("usb: dwc2: drd: fix soft connect when gadget is unconfigured") Cc: stable@kernel.org Cc: Amelie Delaunay Cc: Minas Harutyunyan Cc: Fabrice Gasnier Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/drd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/usb/dwc2/drd.c b/drivers/usb/dwc2/drd.c index 9b6d44d90ad9..1b39c4776369 100644 --- a/drivers/usb/dwc2/drd.c +++ b/drivers/usb/dwc2/drd.c @@ -130,10 +130,8 @@ static int dwc2_drd_role_sw_set(struct usb_role_switch *sw, enum usb_role role) already = dwc2_ovr_avalid(hsotg, true); } else if (role == USB_ROLE_DEVICE) { already = dwc2_ovr_bvalid(hsotg, true); - if (hsotg->enabled) { - /* This clear DCTL.SFTDISCON bit */ - dwc2_hsotg_core_connect(hsotg); - } + /* This clear DCTL.SFTDISCON bit */ + dwc2_hsotg_core_connect(hsotg); } else { if (dwc2_is_device_mode(hsotg)) { if (!dwc2_ovr_bvalid(hsotg, false)) From 1b9e740a81f91ae338b29ed70455719804957b80 Mon Sep 17 00:00:00 2001 From: Jing Leng Date: Fri, 11 Feb 2022 17:27:36 +0800 Subject: [PATCH 1282/1295] kconfig: fix failing to generate auto.conf When the KCONFIG_AUTOCONFIG is specified (e.g. export \ KCONFIG_AUTOCONFIG=output/config/auto.conf), the directory of include/config/ will not be created, so kconfig can't create deps files in it and auto.conf can't be generated. Signed-off-by: Jing Leng Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 16897cb8cefd..d3c3a61308ad 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -994,14 +994,19 @@ static int conf_write_autoconf_cmd(const char *autoconf_name) static int conf_touch_deps(void) { - const char *name; + const char *name, *tmp; struct symbol *sym; int res, i; - strcpy(depfile_path, "include/config/"); - depfile_prefix_len = strlen(depfile_path); - name = conf_get_autoconfig_name(); + tmp = strrchr(name, '/'); + depfile_prefix_len = tmp ? tmp - name + 1 : 0; + if (depfile_prefix_len + 1 > sizeof(depfile_path)) + return -1; + + strncpy(depfile_path, name, depfile_prefix_len); + depfile_path[depfile_prefix_len] = 0; + conf_read_simple(name, S_DEF_AUTO); sym_calc_value(modules_sym); From 754e0b0e35608ed5206d6a67a791563c631cec07 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Feb 2022 12:13:30 -0800 Subject: [PATCH 1283/1295] Linux 5.17-rc4 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ceb987e5c87b..51e142f760f7 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc3 -NAME = Gobble Gobble +EXTRAVERSION = -rc4 +NAME = Superb Owl # *DOCUMENTATION* # To see a list of typical targets execute "make help" From b160628e9ebcdc85d0db9d7f423c26b3c7c179d0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 13 Feb 2022 22:29:25 +0100 Subject: [PATCH 1284/1295] parisc: Show error if wrong 32/64-bit compiler is being used It happens quite often that people use the wrong compiler to build the kernel: make ARCH=parisc -> builds the 32-bit kernel make ARCH=parisc64 -> builds the 64-bit kernel This patch adds a sanity check which errors out with an instruction how use the correct ARCH= option. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v5.15+ --- arch/parisc/include/asm/bitops.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 0ec9cfc5131f..56ffd260c669 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -12,6 +12,14 @@ #include #include +/* compiler build environment sanity checks: */ +#if !defined(CONFIG_64BIT) && defined(__LP64__) +#error "Please use 'ARCH=parisc' to build the 32-bit kernel." +#endif +#if defined(CONFIG_64BIT) && !defined(__LP64__) +#error "Please use 'ARCH=parisc64' to build the 64-bit kernel." +#endif + /* See http://marc.theaimsgroup.com/?t=108826637900003 for discussion * on use of volatile and __*_bit() (set/clear/change): * *_bit() want use of volatile. From dbd0b42350d5717786cb8257fbe5b528f3af9772 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 13 Feb 2022 22:52:11 +0100 Subject: [PATCH 1285/1295] parisc: Fix some apparent put_user() failures After commit 4b9d2a731c3d ("parisc: Switch user access functions to signal errors in r29 instead of r8") bash suddenly started to report those warnings after login: -bash: cannot set terminal process group (-1): Bad file descriptor -bash: no job control in this shell It turned out, that a function call inside a put_user(), e.g.: put_user(vt_do_kdgkbmode(console), (int __user *)arg); clobbered the error register (r29) and thus the put_user() call itself seem to have failed. Rearrange the C-code to pre-calculate the intermediate value and then do the put_user(). Additionally prefer the "+" constraint on pu_err and gu_err registers to tell the compiler that those operands are both read and written by the assembly instruction. Reported-by: John David Anglin Signed-off-by: Helge Deller Fixes: 4b9d2a731c3d ("parisc: Switch user access functions to signal errors in r29 instead of r8") Signed-off-by: Helge Deller --- arch/parisc/include/asm/uaccess.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ebf8a845b017..123d5f16cd9d 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -89,8 +89,8 @@ struct exception_table_entry { __asm__("1: " ldx " 0(" sr "%2),%0\n" \ "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ - : "=r"(__gu_val), "=r"(__gu_err) \ - : "r"(ptr), "1"(__gu_err)); \ + : "=r"(__gu_val), "+r"(__gu_err) \ + : "r"(ptr)); \ \ (val) = (__force __typeof__(*(ptr))) __gu_val; \ } @@ -123,8 +123,8 @@ struct exception_table_entry { "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ - : "=&r"(__gu_tmp.l), "=r"(__gu_err) \ - : "r"(ptr), "1"(__gu_err)); \ + : "=&r"(__gu_tmp.l), "+r"(__gu_err) \ + : "r"(ptr)); \ \ (val) = __gu_tmp.t; \ } @@ -135,13 +135,12 @@ struct exception_table_entry { #define __put_user_internal(sr, x, ptr) \ ({ \ ASM_EXCEPTIONTABLE_VAR(__pu_err); \ - __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x); \ \ switch (sizeof(*(ptr))) { \ - case 1: __put_user_asm(sr, "stb", __x, ptr); break; \ - case 2: __put_user_asm(sr, "sth", __x, ptr); break; \ - case 4: __put_user_asm(sr, "stw", __x, ptr); break; \ - case 8: STD_USER(sr, __x, ptr); break; \ + case 1: __put_user_asm(sr, "stb", x, ptr); break; \ + case 2: __put_user_asm(sr, "sth", x, ptr); break; \ + case 4: __put_user_asm(sr, "stw", x, ptr); break; \ + case 8: STD_USER(sr, x, ptr); break; \ default: BUILD_BUG(); \ } \ \ @@ -150,7 +149,9 @@ struct exception_table_entry { #define __put_user(x, ptr) \ ({ \ - __put_user_internal("%%sr3,", x, ptr); \ + __typeof__(&*(ptr)) __ptr = ptr; \ + __typeof__(*(__ptr)) __x = (__typeof__(*(__ptr)))(x); \ + __put_user_internal("%%sr3,", __x, __ptr); \ }) #define __put_kernel_nofault(dst, src, type, err_label) \ @@ -180,8 +181,8 @@ struct exception_table_entry { "1: " stx " %2,0(" sr "%1)\n" \ "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ - : "=r"(__pu_err) \ - : "r"(ptr), "r"(x), "0"(__pu_err)) + : "+r"(__pu_err) \ + : "r"(ptr), "r"(x)) #if !defined(CONFIG_64BIT) @@ -193,8 +194,8 @@ struct exception_table_entry { "9:\n" \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ - : "=r"(__pu_err) \ - : "r"(ptr), "r"(__val), "0"(__pu_err)); \ + : "+r"(__pu_err) \ + : "r"(ptr), "r"(__val)); \ } while (0) #endif /* !defined(CONFIG_64BIT) */ From 6e8793674bb0d1135ca0e5c9f7e16fecbf815926 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 14 Feb 2022 10:00:19 -0800 Subject: [PATCH 1286/1295] serial: parisc: GSC: fix build when IOSAPIC is not set There is a build error when using a kernel .config file from 'kernel test robot' for a different build problem: hppa64-linux-ld: drivers/tty/serial/8250/8250_gsc.o: in function `.LC3': (.data.rel.ro+0x18): undefined reference to `iosapic_serial_irq' when: CONFIG_GSC=y CONFIG_SERIO_GSCPS2=y CONFIG_SERIAL_8250_GSC=y CONFIG_PCI is not set and hence PCI_LBA is not set. IOSAPIC depends on PCI_LBA, so IOSAPIC is not set/enabled. Make the use of iosapic_serial_irq() conditional to fix the build error. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: linux-parisc@vger.kernel.org Cc: Greg Kroah-Hartman Cc: linux-serial@vger.kernel.org Cc: Jiri Slaby Cc: Johan Hovold Suggested-by: Helge Deller Signed-off-by: Helge Deller Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- drivers/tty/serial/8250/8250_gsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_gsc.c b/drivers/tty/serial/8250/8250_gsc.c index 673cda3d011d..948d0a1c6ae8 100644 --- a/drivers/tty/serial/8250/8250_gsc.c +++ b/drivers/tty/serial/8250/8250_gsc.c @@ -26,7 +26,7 @@ static int __init serial_init_chip(struct parisc_device *dev) unsigned long address; int err; -#ifdef CONFIG_64BIT +#if defined(CONFIG_64BIT) && defined(CONFIG_IOSAPIC) if (!dev->irq && (dev->id.sversion == 0xad)) dev->irq = iosapic_serial_irq(dev); #endif From f1d1b3a9b41125d8a98b6ec94713715af332ffb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 19:37:02 +0100 Subject: [PATCH 1287/1295] HID: apple: Refactor key translation setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code used to map the apple_key_translation structs is duplicated. Extract it to a common function. Refactor, no functional changes. Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 7dc89dc6b0f0..5d2778b6dbcb 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -202,6 +202,15 @@ static const struct apple_key_translation swapped_fn_leftctrl_keys[] = { { } }; +static inline void apple_setup_key_translation(struct input_dev *input, + const struct apple_key_translation *table) +{ + const struct apple_key_translation *trans; + + for (trans = table; trans->from; trans++) + set_bit(trans->to, input->keybit); +} + static const struct apple_key_translation *apple_find_translation( const struct apple_key_translation *table, u16 from) { @@ -452,30 +461,17 @@ static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc, static void apple_setup_input(struct input_dev *input) { - const struct apple_key_translation *trans; - set_bit(KEY_NUMLOCK, input->keybit); /* Enable all needed keys */ - for (trans = apple_fn_keys; trans->from; trans++) - set_bit(trans->to, input->keybit); + apple_setup_key_translation(input, apple_fn_keys); + apple_setup_key_translation(input, powerbook_fn_keys); + apple_setup_key_translation(input, powerbook_numlock_keys); + apple_setup_key_translation(input, apple_iso_keyboard); + apple_setup_key_translation(input, apple2021_fn_keys); - for (trans = powerbook_fn_keys; trans->from; trans++) - set_bit(trans->to, input->keybit); - - for (trans = powerbook_numlock_keys; trans->from; trans++) - set_bit(trans->to, input->keybit); - - for (trans = apple_iso_keyboard; trans->from; trans++) - set_bit(trans->to, input->keybit); - - for (trans = apple2021_fn_keys; trans->from; trans++) - set_bit(trans->to, input->keybit); - - if (swap_fn_leftctrl) { - for (trans = swapped_fn_leftctrl_keys; trans->from; trans++) - set_bit(trans->to, input->keybit); - } + if (swap_fn_leftctrl) + apple_setup_key_translation(input, swapped_fn_leftctrl_keys); } static int apple_input_mapping(struct hid_device *hdev, struct hid_input *hi, From 0fea6fe7d5ef1b5fa5f78048d4729f85181c04ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 19:37:03 +0100 Subject: [PATCH 1288/1295] HID: apple: Magic Keyboard first generation FN key mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function key mapping for the Magic Keyboard first generation (2007, 2009 and 2011 aluminum wireless models) was not present and the default one was used instead. This caused two main issues: - The F5 and F6 keys were sending KEY_KBDILLUMDOWN and KEY_KBDILLUMUP; however, the keyboard is not backlited. - The keyboard has the APPLE_NUMLOCK_EMULATION quirk with F6 set as the KEY_NUMLOCK key by "powerbook_numlock_keys". However, because F6 was mapped to KEY_KBDILLUMUP by the default mapping it was not possible to switch the numlock status. This means that, if numlock was enabled on session startup, it was not possible to disable it without connecting another keyboard. Add a custom translation table for the device leaving F5 unassigned and using F6 as the KEY_NUMLOCK key. Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 5d2778b6dbcb..8097765f9df6 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -76,6 +76,27 @@ struct apple_key_translation { u8 flags; }; +static const struct apple_key_translation magic_keyboard_alu_fn_keys[] = { + { KEY_BACKSPACE, KEY_DELETE }, + { KEY_ENTER, KEY_INSERT }, + { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, + { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, + { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, + { KEY_F4, KEY_DASHBOARD, APPLE_FLAG_FKEY }, + { KEY_F6, KEY_NUMLOCK, APPLE_FLAG_FKEY }, + { KEY_F7, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, + { KEY_F8, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, + { KEY_F9, KEY_NEXTSONG, APPLE_FLAG_FKEY }, + { KEY_F10, KEY_MUTE, APPLE_FLAG_FKEY }, + { KEY_F11, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, + { KEY_F12, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, + { KEY_UP, KEY_PAGEUP }, + { KEY_DOWN, KEY_PAGEDOWN }, + { KEY_LEFT, KEY_HOME }, + { KEY_RIGHT, KEY_END }, + { } +}; + static const struct apple_key_translation apple2021_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, @@ -251,9 +272,19 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } if (fnmode) { - if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 || - hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || - hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) + if (hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_ISO || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_JIS || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_JIS || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_ANSI || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_ISO || + hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_JIS) + table = magic_keyboard_alu_fn_keys; + else if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 || + hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || + hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) table = apple2021_fn_keys; else if (hid->product >= USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI && hid->product <= USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS) @@ -468,6 +499,7 @@ static void apple_setup_input(struct input_dev *input) apple_setup_key_translation(input, powerbook_fn_keys); apple_setup_key_translation(input, powerbook_numlock_keys); apple_setup_key_translation(input, apple_iso_keyboard); + apple_setup_key_translation(input, magic_keyboard_alu_fn_keys); apple_setup_key_translation(input, apple2021_fn_keys); if (swap_fn_leftctrl) From 250b369ed2380ba9accda31931e63ae351ab9f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 19:37:04 +0100 Subject: [PATCH 1289/1295] HID: apple: Magic Keyboard 2015 FN key mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Magic Keyboard 2015 function key mapping was not present and the default mapping was used. While this worked for most keys, the F5 and F6 keys were sending KEY_KBDILLUMDOWN and KEY_KBDILLUMUP; however, the keyboard is not backlited. Add a custom translation table for the keyboard leaving F5 and F6 unassigned to mimic the default behavior on macOS. Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 8097765f9df6..2ce404b46adf 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -97,6 +97,26 @@ static const struct apple_key_translation magic_keyboard_alu_fn_keys[] = { { } }; +static const struct apple_key_translation magic_keyboard_2015_fn_keys[] = { + { KEY_BACKSPACE, KEY_DELETE }, + { KEY_ENTER, KEY_INSERT }, + { KEY_F1, KEY_BRIGHTNESSDOWN, APPLE_FLAG_FKEY }, + { KEY_F2, KEY_BRIGHTNESSUP, APPLE_FLAG_FKEY }, + { KEY_F3, KEY_SCALE, APPLE_FLAG_FKEY }, + { KEY_F4, KEY_DASHBOARD, APPLE_FLAG_FKEY }, + { KEY_F7, KEY_PREVIOUSSONG, APPLE_FLAG_FKEY }, + { KEY_F8, KEY_PLAYPAUSE, APPLE_FLAG_FKEY }, + { KEY_F9, KEY_NEXTSONG, APPLE_FLAG_FKEY }, + { KEY_F10, KEY_MUTE, APPLE_FLAG_FKEY }, + { KEY_F11, KEY_VOLUMEDOWN, APPLE_FLAG_FKEY }, + { KEY_F12, KEY_VOLUMEUP, APPLE_FLAG_FKEY }, + { KEY_UP, KEY_PAGEUP }, + { KEY_DOWN, KEY_PAGEDOWN }, + { KEY_LEFT, KEY_HOME }, + { KEY_RIGHT, KEY_END }, + { } +}; + static const struct apple_key_translation apple2021_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, @@ -282,6 +302,9 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_ISO || hid->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_2011_JIS) table = magic_keyboard_alu_fn_keys; + else if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2015 || + hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2015) + table = magic_keyboard_2015_fn_keys; else if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) @@ -500,6 +523,7 @@ static void apple_setup_input(struct input_dev *input) apple_setup_key_translation(input, powerbook_numlock_keys); apple_setup_key_translation(input, apple_iso_keyboard); apple_setup_key_translation(input, magic_keyboard_alu_fn_keys); + apple_setup_key_translation(input, magic_keyboard_2015_fn_keys); apple_setup_key_translation(input, apple2021_fn_keys); if (swap_fn_leftctrl) From 8ae5c16c9d421d43f32f66d2308031f1bd3f9336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 19:50:09 +0100 Subject: [PATCH 1290/1295] HID: apple: Report Magic Keyboard 2021 battery over USB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like the Apple Magic Keyboard 2015, when connected over USB, the 2021 version registers 2 different interfaces. One of them is used to report the battery level. However, unlike when connected over Bluetooth, the battery level is not reported automatically and it is required to fetch it manually. Add the APPLE_RDESC_BATTERY quirk to fix the battery report descriptor and manually fetch the battery level. Tested with the ANSI, ISO and JIS variants of the keyboard. Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 2ce404b46adf..3ed7a6998492 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -800,7 +800,7 @@ static const struct hid_device_id apple_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY), .driver_data = APPLE_NUMLOCK_EMULATION | APPLE_HAS_FN }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021), - .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), From cbfcfbfc384890a062a5d0cc4792df094a6cc7a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 19:55:30 +0100 Subject: [PATCH 1291/1295] HID: apple: Report Magic Keyboard 2021 with fingerprint reader battery over USB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like the Apple Magic Keyboard 2015, when connected over USB, the 2021 version with fingerprint reader registers 2 different interfaces. One of them is used to report the battery level. However, unlike when connected over Bluetooth, the battery level is not reported automatically and it is required to fetch it manually. Add the APPLE_RDESC_BATTERY quirk to fix the battery report descriptor and manually fetch the battery level. Tested with the ANSI variant of the keyboard with and without numpad. Signed-off-by: José Expósito Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 3ed7a6998492..b3b33813d16c 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -804,11 +804,11 @@ static const struct hid_device_id apple_devices[] = { { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), - .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021), - .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, From 9018eacbe623b2c3535da37035e5f22d3d70b6ce Mon Sep 17 00:00:00 2001 From: Paul Pawlowski Date: Thu, 3 Feb 2022 12:21:13 +0000 Subject: [PATCH 1292/1295] HID: apple: Add support for keyboard backlight on certain T2 Macs. This patch introduces the requisite plumbing for supporting keyboard backlight on T2-attached, USB exposed models. The quirk mechanism was used to reuse the existing hid-apple driver. Signed-off-by: Paul Pawlowski Signed-off-by: Aun-Ali Zaidi Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 125 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index b3b33813d16c..8998d005b1db 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -7,6 +7,7 @@ * Copyright (c) 2005 Michael Haboustak for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby + * Copyright (c) 2019 Paul Pawlowski */ /* @@ -33,6 +34,7 @@ /* BIT(7) reserved, was: APPLE_IGNORE_HIDINPUT */ #define APPLE_NUMLOCK_EMULATION BIT(8) #define APPLE_RDESC_BATTERY BIT(9) +#define APPLE_BACKLIGHT_CTL BIT(10) #define APPLE_FLAG_FKEY 0x01 @@ -61,6 +63,12 @@ MODULE_PARM_DESC(swap_fn_leftctrl, "Swap the Fn and left Control keys. " "(For people who want to keep PC keyboard muscle memory. " "[0] = as-is, Mac layout, 1 = swapped, PC layout)"); +struct apple_sc_backlight { + struct led_classdev cdev; + struct hid_device *hdev; + unsigned short backlight_off, backlight_on_min, backlight_on_max; +}; + struct apple_sc { struct hid_device *hdev; unsigned long quirks; @@ -68,6 +76,7 @@ struct apple_sc { unsigned int fn_found; DECLARE_BITMAP(pressed_numlock, KEY_CNT); struct timer_list battery_timer; + struct apple_sc_backlight *backlight; }; struct apple_key_translation { @@ -117,6 +126,20 @@ static const struct apple_key_translation magic_keyboard_2015_fn_keys[] = { { } }; +struct apple_backlight_config_report { + u8 report_id; + u8 version; + u16 backlight_off, backlight_on_min, backlight_on_max; +}; + +struct apple_backlight_set_report { + u8 report_id; + u8 version; + u16 backlight; + u16 rate; +}; + + static const struct apple_key_translation apple2021_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, @@ -582,6 +605,105 @@ static int apple_input_configured(struct hid_device *hdev, return 0; } +static bool apple_backlight_check_support(struct hid_device *hdev) +{ + int i; + unsigned int hid; + struct hid_report *report; + + list_for_each_entry(report, &hdev->report_enum[HID_INPUT_REPORT].report_list, list) { + for (i = 0; i < report->maxfield; i++) { + hid = report->field[i]->usage->hid; + if ((hid & HID_USAGE_PAGE) == HID_UP_MSVENDOR && (hid & HID_USAGE) == 0xf) + return true; + } + } + + return false; +} + +static int apple_backlight_set(struct hid_device *hdev, u16 value, u16 rate) +{ + int ret = 0; + struct apple_backlight_set_report *rep; + + rep = kmalloc(sizeof(*rep), GFP_KERNEL); + if (rep == NULL) + return -ENOMEM; + + rep->report_id = 0xB0; + rep->version = 1; + rep->backlight = value; + rep->rate = rate; + + ret = hid_hw_raw_request(hdev, 0xB0u, (u8 *) rep, sizeof(*rep), + HID_OUTPUT_REPORT, HID_REQ_SET_REPORT); + + kfree(rep); + return ret; +} + +static int apple_backlight_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct apple_sc_backlight *backlight = container_of(led_cdev, + struct apple_sc_backlight, cdev); + + return apple_backlight_set(backlight->hdev, brightness, 0); +} + +static int apple_backlight_init(struct hid_device *hdev) +{ + int ret; + struct apple_sc *asc = hid_get_drvdata(hdev); + struct apple_backlight_config_report *rep; + + if (!apple_backlight_check_support(hdev)) + return -EINVAL; + + rep = kmalloc(0x200, GFP_KERNEL); + if (rep == NULL) + return -ENOMEM; + + ret = hid_hw_raw_request(hdev, 0xBFu, (u8 *) rep, sizeof(*rep), + HID_FEATURE_REPORT, HID_REQ_GET_REPORT); + if (ret < 0) { + hid_err(hdev, "backlight request failed: %d\n", ret); + goto cleanup_and_exit; + } + if (ret < 8 || rep->version != 1) { + hid_err(hdev, "backlight config struct: bad version %i\n", rep->version); + ret = -EINVAL; + goto cleanup_and_exit; + } + + hid_dbg(hdev, "backlight config: off=%u, on_min=%u, on_max=%u\n", + rep->backlight_off, rep->backlight_on_min, rep->backlight_on_max); + + asc->backlight = devm_kzalloc(&hdev->dev, sizeof(*asc->backlight), GFP_KERNEL); + if (!asc->backlight) { + ret = -ENOMEM; + goto cleanup_and_exit; + } + + asc->backlight->hdev = hdev; + asc->backlight->cdev.name = "apple::kbd_backlight"; + asc->backlight->cdev.max_brightness = rep->backlight_on_max; + asc->backlight->cdev.brightness_set_blocking = apple_backlight_led_set; + + ret = apple_backlight_set(hdev, 0, 0); + if (ret < 0) { + hid_err(hdev, "backlight set request failed: %d\n", ret); + goto cleanup_and_exit; + } + + ret = devm_led_classdev_register(&hdev->dev, &asc->backlight->cdev); + +cleanup_and_exit: + kfree(rep); + return ret; +} + static int apple_probe(struct hid_device *hdev, const struct hid_device_id *id) { @@ -617,6 +739,9 @@ static int apple_probe(struct hid_device *hdev, jiffies + msecs_to_jiffies(APPLE_BATTERY_TIMEOUT_MS)); apple_fetch_battery(hdev); + if (quirks & APPLE_BACKLIGHT_CTL) + apple_backlight_init(hdev); + return 0; } From 42f6a2d300233ae3da48f90f3463695b89c21ea0 Mon Sep 17 00:00:00 2001 From: Aun-Ali Zaidi Date: Thu, 3 Feb 2022 12:22:09 +0000 Subject: [PATCH 1293/1295] HID: apple: Add necessary IDs and configuration for T2 Macs. This patch adds the necessary IDs and configuration for Macs with the T2 Security chip. Signed-off-by: Aun-Ali Zaidi Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 16 ++++++++++++++++ drivers/hid/hid-ids.h | 8 ++++++++ drivers/hid/hid-quirks.c | 16 ++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 8998d005b1db..bf48293aeb35 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -913,6 +913,22 @@ static const struct hid_device_id apple_devices[] = { .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_JIS), .driver_data = APPLE_HAS_FN | APPLE_RDESC_JIS }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K), + .driver_data = APPLE_HAS_FN | APPLE_BACKLIGHT_CTL }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132), + .driver_data = APPLE_HAS_FN | APPLE_BACKLIGHT_CTL }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680), + .driver_data = APPLE_HAS_FN | APPLE_BACKLIGHT_CTL }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213), + .driver_data = APPLE_HAS_FN | APPLE_BACKLIGHT_CTL }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K), + .driver_data = APPLE_HAS_FN }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223), + .driver_data = APPLE_HAS_FN }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K), + .driver_data = APPLE_HAS_FN }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F), + .driver_data = APPLE_HAS_FN }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI), .driver_data = APPLE_NUMLOCK_EMULATION | APPLE_HAS_FN }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO), diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 78bd3ddda442..6c2a36b16ed2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -167,6 +167,14 @@ #define USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI 0x0272 #define USB_DEVICE_ID_APPLE_WELLSPRING9_ISO 0x0273 #define USB_DEVICE_ID_APPLE_WELLSPRING9_JIS 0x0274 +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K 0x027a +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132 0x027b +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680 0x027c +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213 0x027d +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K 0x027e +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223 0x027f +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K 0x0280 +#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F 0x0340 #define USB_DEVICE_ID_APPLE_FOUNTAIN_TP_ONLY 0x030a #define USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY 0x030b #define USB_DEVICE_ID_APPLE_IRCONTROL 0x8240 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index c066ba901867..dc67717d2dab 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -295,6 +295,14 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_ISO) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_JIS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_JIS) }, @@ -930,6 +938,14 @@ static const struct hid_device_id hid_mouse_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_ISO) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING9_JIS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_TP_ONLY) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY) }, { } From c5f09b1b45cbb90147846f82ec0489789c99667e Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Thu, 3 Feb 2022 12:23:02 +0000 Subject: [PATCH 1294/1295] HID: apple: Add fn mapping for MacBook Pros with Touch Bar This patch adds the Fn mapping for keyboards on certain T2 Macs. [jkosina@suse.cz: rebase on top of apple_setup_input() refactoring] Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index bf48293aeb35..0cf35caee9fa 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -183,6 +183,51 @@ static const struct apple_key_translation macbookair_fn_keys[] = { { } }; +static const struct apple_key_translation macbookpro_no_esc_fn_keys[] = { + { KEY_BACKSPACE, KEY_DELETE }, + { KEY_ENTER, KEY_INSERT }, + { KEY_GRAVE, KEY_ESC }, + { KEY_1, KEY_F1 }, + { KEY_2, KEY_F2 }, + { KEY_3, KEY_F3 }, + { KEY_4, KEY_F4 }, + { KEY_5, KEY_F5 }, + { KEY_6, KEY_F6 }, + { KEY_7, KEY_F7 }, + { KEY_8, KEY_F8 }, + { KEY_9, KEY_F9 }, + { KEY_0, KEY_F10 }, + { KEY_MINUS, KEY_F11 }, + { KEY_EQUAL, KEY_F12 }, + { KEY_UP, KEY_PAGEUP }, + { KEY_DOWN, KEY_PAGEDOWN }, + { KEY_LEFT, KEY_HOME }, + { KEY_RIGHT, KEY_END }, + { } +}; + +static const struct apple_key_translation macbookpro_dedicated_esc_fn_keys[] = { + { KEY_BACKSPACE, KEY_DELETE }, + { KEY_ENTER, KEY_INSERT }, + { KEY_1, KEY_F1 }, + { KEY_2, KEY_F2 }, + { KEY_3, KEY_F3 }, + { KEY_4, KEY_F4 }, + { KEY_5, KEY_F5 }, + { KEY_6, KEY_F6 }, + { KEY_7, KEY_F7 }, + { KEY_8, KEY_F8 }, + { KEY_9, KEY_F9 }, + { KEY_0, KEY_F10 }, + { KEY_MINUS, KEY_F11 }, + { KEY_EQUAL, KEY_F12 }, + { KEY_UP, KEY_PAGEUP }, + { KEY_DOWN, KEY_PAGEDOWN }, + { KEY_LEFT, KEY_HOME }, + { KEY_RIGHT, KEY_END }, + { } +}; + static const struct apple_key_translation apple_fn_keys[] = { { KEY_BACKSPACE, KEY_DELETE }, { KEY_ENTER, KEY_INSERT }, @@ -332,6 +377,17 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) table = apple2021_fn_keys; + else if (hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132 || + hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680 || + hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213) + table = macbookpro_no_esc_fn_keys; + else if (hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K || + hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223 || + hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F) + table = macbookpro_dedicated_esc_fn_keys; + else if (hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K || + hid->product == USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K) + table = apple_fn_keys; else if (hid->product >= USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI && hid->product <= USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS) table = macbookair_fn_keys; @@ -548,6 +604,8 @@ static void apple_setup_input(struct input_dev *input) apple_setup_key_translation(input, magic_keyboard_alu_fn_keys); apple_setup_key_translation(input, magic_keyboard_2015_fn_keys); apple_setup_key_translation(input, apple2021_fn_keys); + apple_setup_key_translation(input, macbookpro_no_esc_fn_keys); + apple_setup_key_translation(input, macbookpro_dedicated_esc_fn_keys); if (swap_fn_leftctrl) apple_setup_key_translation(input, swapped_fn_leftctrl_keys); From ed9f4f961260de51ed76d903e86b35ee8ce8eeb3 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 17 Feb 2022 08:17:02 +0100 Subject: [PATCH 1295/1295] HID: apple: properly reflect LEDS dependency Since hid-apple driver now makes use of LEDS functionality, reflect this properly in Kconfig. Reported-by: kernel test robot Fixes: 9018eacbe623b ("HID: apple: Add support for keyboard backlight on certain T2 Macs.") Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index f5544157576c..9c0e45f56d84 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -128,6 +128,8 @@ config HID_ACRUX_FF config HID_APPLE tristate "Apple {i,Power,Mac}Books" depends on HID + depends on LEDS_CLASS + depends on NEW_LEDS default !EXPERT help Support for some Apple devices which less or more break