From 88b333b0ed790f9433ff542b163bf972953b74d3 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:29 -0700 Subject: [PATCH 01/86] drm/msm: Ensure that the hardware write pointer is valid Currently the value written to CP_RB_WPTR is calculated on the fly as (rb->next - rb->start). But as the code is designed rb->next is wrapped before writing the commands so if a series of commands happened to fit perfectly in the ringbuffer, rb->next would end up being equal to rb->size / 4 and thus result in an out of bounds address to CP_RB_WPTR. The easiest way to fix this is to mask WPTR when writing it to the hardware; it makes the hardware happy and the rest of the ringbuffer math appears to work and there isn't any point in upsetting anything. Signed-off-by: Jordan Crouse [squash in is_power_of_2() check] Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 9 ++++++++- drivers/gpu/drm/msm/msm_ringbuffer.c | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index a18126150e11..14ff87686a36 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -213,7 +213,14 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, void adreno_flush(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t wptr = get_wptr(gpu->rb); + uint32_t wptr; + + /* + * Mask wptr value that we calculate to fit in the HW range. This is + * to account for the possibility that the last command fit exactly into + * the ringbuffer and rb->next hasn't wrapped to zero yet + */ + wptr = get_wptr(gpu->rb) & ((gpu->rb->size / 4) - 1); /* ensure writes to ringbuffer have hit system memory: */ mb(); diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index f326cf6a32e6..67b34e069abf 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -23,7 +23,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) struct msm_ringbuffer *ring; int ret; - size = ALIGN(size, 4); /* size should be dword aligned */ + if (WARN_ON(!is_power_of_2(size))) + return ERR_PTR(-EINVAL); ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) { From 6490abc4bc35fa4f3bdb9c7e49096943c50e29ea Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:30 -0700 Subject: [PATCH 02/86] drm/msm: Put back the vaddr in submit_reloc() The error cases in submit_reloc() need to put back the virtual address of the bo before failling. Add a single failure path for the function. Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 166e84e4f0d4..b6411ea6ed17 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -290,7 +290,7 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob { uint32_t i, last_offset = 0; uint32_t *ptr; - int ret; + int ret = 0; if (offset % 4) { DRM_ERROR("non-aligned cmdstream buffer: %u\n", offset); @@ -318,12 +318,13 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob ret = copy_from_user(&submit_reloc, userptr, sizeof(submit_reloc)); if (ret) - return -EFAULT; + goto out; if (submit_reloc.submit_offset % 4) { DRM_ERROR("non-aligned reloc offset: %u\n", submit_reloc.submit_offset); - return -EINVAL; + ret = -EINVAL; + goto out; } /* offset in dwords: */ @@ -332,12 +333,13 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob if ((off >= (obj->base.size / 4)) || (off < last_offset)) { DRM_ERROR("invalid offset %u at reloc %u\n", off, i); - return -EINVAL; + ret = -EINVAL; + goto out; } ret = submit_bo(submit, submit_reloc.reloc_idx, NULL, &iova, &valid); if (ret) - return ret; + goto out; if (valid) continue; @@ -354,9 +356,10 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob last_offset = off; } +out: msm_gem_put_vaddr_locked(&obj->base); - return 0; + return ret; } static void submit_cleanup(struct msm_gem_submit *submit) From a6cb3b864b21b7345f824a4faa12b723c8aaf099 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 20 Dec 2016 08:54:31 -0700 Subject: [PATCH 03/86] drm/msm: Verify that MSM_SUBMIT_BO_FLAGS are set For every submission buffer object one of MSM_SUBMIT_BO_WRITE and MSM_SUBMIT_BO_READ must be set (and nothing else). If we allowed zero then the buffer object would never get queued to be unreferenced. Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index b6411ea6ed17..489676568a10 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -106,7 +106,8 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, pagefault_disable(); } - if (submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) { + if ((submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) || + !(submit_bo.flags & MSM_SUBMIT_BO_FLAGS)) { DRM_ERROR("invalid flags: %x\n", submit_bo.flags); ret = -EINVAL; goto out_unlock; From abc8d5832fd142d565fbfca163348f33b08bc1fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Dec 2016 10:08:14 +0100 Subject: [PATCH 04/86] gpio: mxs: remove __init annotation Building with an old toolchain, I ran into this warning: WARNING: vmlinux.o(.text+0x63eef0): Section mismatch in reference from the function mxs_gpio_probe() to the function .init.text:mxs_gpio_init_gc() Clearly the annotation is wrong, since the function is called from the non-init probe, so let's remove it. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mxs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index 1e8fde8cb803..2292742eac8f 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -205,7 +205,7 @@ static int mxs_gpio_set_wake_irq(struct irq_data *d, unsigned int enable) return 0; } -static int __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base) +static int mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base) { struct irq_chip_generic *gc; struct irq_chip_type *ct; From 5018ada69a04c8ac21d74bd682fceb8e42dc0f96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 19 Dec 2016 18:29:23 +0100 Subject: [PATCH 05/86] gpio: Move freeing of GPIO hogs before numbing of the device When removing a gpiochip that uses GPIO hogging (e.g. by unloading the chip's DT overlay), a warning is printed: gpio gpiochip8: REMOVING GPIOCHIP WITH GPIOS STILL REQUESTED This happens because gpiochip_free_hogs() is called after the gdev->chip pointer is reset to NULL. Hence __gpiod_free() cannot determine the chip in use, and cannot clear flags nor call the optional chip-specific .free() callback. Move the call to gpiochip_free_hogs() up to fix this. Cc: stable@vger.kernel.org Fixes: ff2b135922992756 ("gpio: make the gpiochip a real device") Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index f4c26c7826cd..86bf3b84ada5 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1317,12 +1317,12 @@ void gpiochip_remove(struct gpio_chip *chip) /* FIXME: should the legacy sysfs handling be moved to gpio_device? */ gpiochip_sysfs_unregister(gdev); + gpiochip_free_hogs(chip); /* Numb the device, cancelling all outstanding operations */ gdev->chip = NULL; gpiochip_irqchip_remove(chip); acpi_gpiochip_remove(chip); gpiochip_remove_pin_ranges(chip); - gpiochip_free_hogs(chip); of_gpiochip_remove(chip); /* * We accept no more calls into the driver from this point, so From 07825f0acd85dd8b7481d5ef0eb024b05364d892 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Dec 2016 17:44:15 +0800 Subject: [PATCH 06/86] crypto: aesni - Fix failure when built-in with modular pcbc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If aesni is built-in but pcbc is built as a module, then aesni will fail completely because when it tries to register the pcbc variant of aes the pcbc template is not available. This patch fixes this by modifying the pcbc presence test so that if aesni is built-in then pcbc must also be built-in for it to be used by aesni. Fixes: 85671860caac ("crypto: aesni - Convert to skcipher") Reported-by: Stephan Müller Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 31c34ee131f3..6ef688a1ef3e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1020,7 +1020,8 @@ struct { const char *basename; struct simd_skcipher_alg *simd; } aesni_simd_skciphers2[] = { -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) +#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \ + IS_BUILTIN(CONFIG_CRYPTO_PCBC) { .algname = "pcbc(aes)", .drvname = "pcbc-aes-aesni", From 570b90fa230b8021f51a67fab2245fe8df6fe37d Mon Sep 17 00:00:00 2001 From: Andrew Lutomirski Date: Mon, 12 Dec 2016 12:55:55 -0800 Subject: [PATCH 07/86] orinoco: Use shash instead of ahash for MIC calculations Eric Biggers pointed out that the orinoco driver pointed scatterlists at the stack. Fix it by switching from ahash to shash. The result should be simpler, faster, and more correct. kvalo: cherry picked from commit 1fef293b8a9850cfa124a53c1d8878d355010403 as I accidentally applied this patch to wireless-drivers-next when I was supposed to apply this wireless-drivers Cc: stable@vger.kernel.org # 4.9 only Reported-by: Eric Biggers Signed-off-by: Andy Lutomirski Signed-off-by: Kalle Valo --- drivers/net/wireless/intersil/orinoco/mic.c | 44 +++++++++++-------- drivers/net/wireless/intersil/orinoco/mic.h | 3 +- .../net/wireless/intersil/orinoco/orinoco.h | 4 +- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c index bc7397d709d3..08bc7822f820 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.c +++ b/drivers/net/wireless/intersil/orinoco/mic.c @@ -16,7 +16,7 @@ /********************************************************************/ int orinoco_mic_init(struct orinoco_private *priv) { - priv->tx_tfm_mic = crypto_alloc_ahash("michael_mic", 0, + priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " @@ -25,7 +25,7 @@ int orinoco_mic_init(struct orinoco_private *priv) return -ENOMEM; } - priv->rx_tfm_mic = crypto_alloc_ahash("michael_mic", 0, + priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " @@ -40,17 +40,16 @@ int orinoco_mic_init(struct orinoco_private *priv) void orinoco_mic_free(struct orinoco_private *priv) { if (priv->tx_tfm_mic) - crypto_free_ahash(priv->tx_tfm_mic); + crypto_free_shash(priv->tx_tfm_mic); if (priv->rx_tfm_mic) - crypto_free_ahash(priv->rx_tfm_mic); + crypto_free_shash(priv->rx_tfm_mic); } -int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, +int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *da, u8 *sa, u8 priority, u8 *data, size_t data_len, u8 *mic) { - AHASH_REQUEST_ON_STACK(req, tfm_michael); - struct scatterlist sg[2]; + SHASH_DESC_ON_STACK(desc, tfm_michael); u8 hdr[ETH_HLEN + 2]; /* size of header + padding */ int err; @@ -67,18 +66,27 @@ int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, hdr[ETH_ALEN * 2 + 2] = 0; hdr[ETH_ALEN * 2 + 3] = 0; - /* Use scatter gather to MIC header and data in one go */ - sg_init_table(sg, 2); - sg_set_buf(&sg[0], hdr, sizeof(hdr)); - sg_set_buf(&sg[1], data, data_len); + desc->tfm = tfm_michael; + desc->flags = 0; - if (crypto_ahash_setkey(tfm_michael, key, MIC_KEYLEN)) - return -1; + err = crypto_shash_setkey(tfm_michael, key, MIC_KEYLEN); + if (err) + return err; + + err = crypto_shash_init(desc); + if (err) + return err; + + err = crypto_shash_update(desc, hdr, sizeof(hdr)); + if (err) + return err; + + err = crypto_shash_update(desc, data, data_len); + if (err) + return err; + + err = crypto_shash_final(desc, mic); + shash_desc_zero(desc); - ahash_request_set_tfm(req, tfm_michael); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, sg, mic, data_len + sizeof(hdr)); - err = crypto_ahash_digest(req); - ahash_request_zero(req); return err; } diff --git a/drivers/net/wireless/intersil/orinoco/mic.h b/drivers/net/wireless/intersil/orinoco/mic.h index ce731d05cc98..e8724e889219 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.h +++ b/drivers/net/wireless/intersil/orinoco/mic.h @@ -6,6 +6,7 @@ #define _ORINOCO_MIC_H_ #include +#include #define MICHAEL_MIC_LEN 8 @@ -15,7 +16,7 @@ struct crypto_ahash; int orinoco_mic_init(struct orinoco_private *priv); void orinoco_mic_free(struct orinoco_private *priv); -int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, +int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *da, u8 *sa, u8 priority, u8 *data, size_t data_len, u8 *mic); diff --git a/drivers/net/wireless/intersil/orinoco/orinoco.h b/drivers/net/wireless/intersil/orinoco/orinoco.h index 2f0c84b1c440..5fa1c3e3713f 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco.h +++ b/drivers/net/wireless/intersil/orinoco/orinoco.h @@ -152,8 +152,8 @@ struct orinoco_private { u8 *wpa_ie; int wpa_ie_len; - struct crypto_ahash *rx_tfm_mic; - struct crypto_ahash *tx_tfm_mic; + struct crypto_shash *rx_tfm_mic; + struct crypto_shash *tx_tfm_mic; unsigned int wpa_enabled:1; unsigned int tkip_cm_active:1; From 60f59ce0278557f7896d5158ae6d12a4855a72cc Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 21 Dec 2016 11:18:55 -0600 Subject: [PATCH 08/86] rtlwifi: rtl_usb: Fix missing entry in USB driver's private data These drivers need to be able to reference "struct ieee80211_hw" from the driver's private data, and vice versa. The USB driver failed to store the address of ieee80211_hw in the private data. Although this bug has been present for a long time, it was not exposed until commit ba9f93f82aba ("rtlwifi: Fix enter/exit power_save"). Fixes: ba9f93f82aba ("rtlwifi: Fix enter/exit power_save") Signed-off-by: Larry Finger Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index 0a508649903d..49015b05f3d1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1063,6 +1063,7 @@ int rtl_usb_probe(struct usb_interface *intf, return -ENOMEM; } rtlpriv = hw->priv; + rtlpriv->hw = hw; rtlpriv->usb_data = kzalloc(RTL_USB_MAX_RX_COUNT * sizeof(u32), GFP_KERNEL); if (!rtlpriv->usb_data) From dcafc45dcb6d8bb6d159ed0a903bd0f3de597fac Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 2 Jan 2017 16:09:59 +0100 Subject: [PATCH 09/86] drm/meson: Fix plane atomic check when no crtc for the plane When no CRTC is associated with the plane, the meson_plane_atomic_check() call breaks the kernel with an Oops. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/meson/meson_plane.c b/drivers/gpu/drm/meson/meson_plane.c index 4942ca090b46..7890e30eb584 100644 --- a/drivers/gpu/drm/meson/meson_plane.c +++ b/drivers/gpu/drm/meson/meson_plane.c @@ -51,6 +51,9 @@ static int meson_plane_atomic_check(struct drm_plane *plane, struct drm_crtc_state *crtc_state; struct drm_rect clip = { 0, }; + if (!state->crtc) + return 0; + crtc_state = drm_atomic_get_crtc_state(state->state, state->crtc); if (IS_ERR(crtc_state)) return PTR_ERR(crtc_state); From f97fd383d9a10fd125bcdafba03240685aed5608 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 19 Dec 2016 15:47:14 +0100 Subject: [PATCH 10/86] drm: tilcdc: simplify the recovery from sync lost error on rev1 Revision 2 of LCDC suffers from an issue where a SYNC_LOST error caused by limited memory bandwidth may leave the picture shifted a couple pixels to the right. This issue has not been observed on revision 1, while the recovery mechanism introduces a different issue, where the END_OF_FRAME interrupt doesn't fire while drm is waiting for vblanks. On rev1: recover from sync lost errors by simply clearing the RASTER_ENABLE bit in the RASTER_CTRL register and re-enabling it again as is suggested by the datasheet. Signed-off-by: Bartosz Golaszewski Reviewed-by: Jyri Sarha Signed-off-by: Jyri Sarha --- drivers/gpu/drm/tilcdc/tilcdc_crtc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c index 9942b0577d6e..20041073e46d 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_crtc.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_crtc.c @@ -856,7 +856,7 @@ irqreturn_t tilcdc_crtc_irq(struct drm_crtc *crtc) struct tilcdc_crtc *tilcdc_crtc = to_tilcdc_crtc(crtc); struct drm_device *dev = crtc->dev; struct tilcdc_drm_private *priv = dev->dev_private; - uint32_t stat; + uint32_t stat, reg; stat = tilcdc_read_irqstatus(dev); tilcdc_clear_irqstatus(dev, stat); @@ -921,17 +921,26 @@ irqreturn_t tilcdc_crtc_irq(struct drm_crtc *crtc) dev_err_ratelimited(dev->dev, "%s(0x%08x): Sync lost", __func__, stat); tilcdc_crtc->frame_intact = false; - if (tilcdc_crtc->sync_lost_count++ > - SYNC_LOST_COUNT_LIMIT) { - dev_err(dev->dev, "%s(0x%08x): Sync lost flood detected, recovering", __func__, stat); - queue_work(system_wq, &tilcdc_crtc->recover_work); - if (priv->rev == 1) + if (priv->rev == 1) { + reg = tilcdc_read(dev, LCDC_RASTER_CTRL_REG); + if (reg & LCDC_RASTER_ENABLE) { tilcdc_clear(dev, LCDC_RASTER_CTRL_REG, - LCDC_V1_SYNC_LOST_INT_ENA); - else + LCDC_RASTER_ENABLE); + tilcdc_set(dev, LCDC_RASTER_CTRL_REG, + LCDC_RASTER_ENABLE); + } + } else { + if (tilcdc_crtc->sync_lost_count++ > + SYNC_LOST_COUNT_LIMIT) { + dev_err(dev->dev, + "%s(0x%08x): Sync lost flood detected, recovering", + __func__, stat); + queue_work(system_wq, + &tilcdc_crtc->recover_work); tilcdc_write(dev, LCDC_INT_ENABLE_CLR_REG, LCDC_SYNC_LOST); - tilcdc_crtc->sync_lost_count = 0; + tilcdc_crtc->sync_lost_count = 0; + } } } From aebe55c2d4b998741c0847ace1b4af47d73c763b Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 3 Jan 2017 01:14:27 +0200 Subject: [PATCH 11/86] drm: Clean up planes in atomic commit helper failure path If waiting for fences fails for blocking commits, planes must be cleaned up before returning. Cc: stable@vger.kernel.org Fixes: f6ce410a59a4 ("drm/fence: allow fence waiting to be interrupted by userspace") Signed-off-by: Laurent Pinchart Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170102231427.7192-1-laurent.pinchart@ideasonboard.com --- drivers/gpu/drm/drm_atomic_helper.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 583f47f27b36..34f757bcabae 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1259,8 +1259,10 @@ int drm_atomic_helper_commit(struct drm_device *dev, if (!nonblock) { ret = drm_atomic_helper_wait_for_fences(dev, state, true); - if (ret) + if (ret) { + drm_atomic_helper_cleanup_planes(dev, state); return ret; + } } /* From 0c931a290cc0377c99a8cd970a49e736dbb23e0e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 2 Jan 2017 16:14:15 +0100 Subject: [PATCH 12/86] drm/meson: Fix CVBS initialization when HDMI is configured by bootloader When the HDMI output is configured by the bootloader, there is mismatch is the pipeline configuration and the Vsync interrupt fails to trigger. This commit disables the HDMI blocks in the probe phase. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_venc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/gpu/drm/meson/meson_venc.c b/drivers/gpu/drm/meson/meson_venc.c index d836b2274531..f7c870172220 100644 --- a/drivers/gpu/drm/meson/meson_venc.c +++ b/drivers/gpu/drm/meson/meson_venc.c @@ -38,6 +38,11 @@ * - TV Panel encoding via ENCT */ +/* HHI Registers */ +#define HHI_VDAC_CNTL0 0x2F4 /* 0xbd offset in data sheet */ +#define HHI_VDAC_CNTL1 0x2F8 /* 0xbe offset in data sheet */ +#define HHI_HDMI_PHY_CNTL0 0x3a0 /* 0xe8 offset in data sheet */ + struct meson_cvbs_enci_mode meson_cvbs_enci_pal = { .mode_tag = MESON_VENC_MODE_CVBS_PAL, .hso_begin = 3, @@ -242,6 +247,20 @@ void meson_venc_disable_vsync(struct meson_drm *priv) void meson_venc_init(struct meson_drm *priv) { + /* Disable CVBS VDAC */ + regmap_write(priv->hhi, HHI_VDAC_CNTL0, 0); + regmap_write(priv->hhi, HHI_VDAC_CNTL1, 8); + + /* Power Down Dacs */ + writel_relaxed(0xff, priv->io_base + _REG(VENC_VDAC_SETTING)); + + /* Disable HDMI PHY */ + regmap_write(priv->hhi, HHI_HDMI_PHY_CNTL0, 0); + + /* Disable HDMI */ + writel_bits_relaxed(0x3, 0, + priv->io_base + _REG(VPU_HDMI_SETTING)); + /* Disable all encoders */ writel_relaxed(0, priv->io_base + _REG(ENCI_VIDEO_EN)); writel_relaxed(0, priv->io_base + _REG(ENCP_VIDEO_EN)); From 5db60ea93d4fbf146c8f7ca286b8b2a091761460 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 4 Jan 2017 10:51:02 +0100 Subject: [PATCH 13/86] drm/meson: Fix CVBS VDAC disable This commit fixes the VDAC disabling register write values. Fixes: bbbe775ec5b5 ("drm: Add support for Amlogic Meson Graphic Controller") Signed-off-by: Neil Armstrong --- drivers/gpu/drm/meson/meson_venc_cvbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/meson/meson_venc_cvbs.c b/drivers/gpu/drm/meson/meson_venc_cvbs.c index c809c085fd78..a2bcc70a03ef 100644 --- a/drivers/gpu/drm/meson/meson_venc_cvbs.c +++ b/drivers/gpu/drm/meson/meson_venc_cvbs.c @@ -167,7 +167,7 @@ static void meson_venc_cvbs_encoder_disable(struct drm_encoder *encoder) /* Disable CVBS VDAC */ regmap_write(priv->hhi, HHI_VDAC_CNTL0, 0); - regmap_write(priv->hhi, HHI_VDAC_CNTL1, 0); + regmap_write(priv->hhi, HHI_VDAC_CNTL1, 8); } static void meson_venc_cvbs_encoder_enable(struct drm_encoder *encoder) From a2b1e8a20c992b01eeb76de00d4f534cbe9f3822 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:59:34 +0100 Subject: [PATCH 14/86] selftests: do not require bash for the generated test Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 71b05891a6a1..831022b12848 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -90,7 +90,7 @@ ifdef INSTALL_PATH done; @# Ask all targets to emit their test scripts - echo "#!/bin/bash" > $(ALL_SCRIPT) + echo "#!/bin/sh" > $(ALL_SCRIPT) echo "cd \$$(dirname \$$0)" >> $(ALL_SCRIPT) echo "ROOT=\$$PWD" >> $(ALL_SCRIPT) From d979e13a3fa9067c8cd46e292ed859626d443996 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:58:20 +0100 Subject: [PATCH 15/86] selftests: do not require bash to run bpf tests Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Acked-by: Daniel Borkmann Signed-off-by: Shuah Khan --- tools/testing/selftests/bpf/test_kmod.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh index 92e627adf354..6d58cca8e235 100755 --- a/tools/testing/selftests/bpf/test_kmod.sh +++ b/tools/testing/selftests/bpf/test_kmod.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh SRC_TREE=../../../../ From 3659f98b5375d195f1870c3e508fe51e52206839 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 14 Dec 2016 11:59:57 +0100 Subject: [PATCH 16/86] selftests: do not require bash to run netsocktests testcase Nothing in this minimal script seems to require bash. We often run these tests on embedded devices where the only shell available is the busybox ash. Use sh instead. Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/net/run_netsocktests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests index c09a682df56a..16058bbea7a8 100755 --- a/tools/testing/selftests/net/run_netsocktests +++ b/tools/testing/selftests/net/run_netsocktests @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh echo "--------------------" echo "running socket test" From 7738789fba09108a28a5fb4739595d9a0a2f85fe Mon Sep 17 00:00:00 2001 From: Colin King Date: Tue, 27 Dec 2016 16:17:21 +0000 Subject: [PATCH 17/86] selftests: x86/pkeys: fix spelling mistake: "itertation" -> "iteration" Fix spelling mistake in print test pass message. Signed-off-by: Colin Ian King Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/protection_keys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c index bdd58c78902e..df9e0a0cdf29 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/x86/protection_keys.c @@ -1367,7 +1367,7 @@ void run_tests_once(void) tracing_off(); close_test_fds(); - printf("test %2d PASSED (itertation %d)\n", test_nr, iteration_nr); + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); dprintf1("======================\n\n"); } iteration_nr++; From 7f4c4f80fd22ec7722e778c1d099e828d2b5dc40 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 19 Dec 2016 13:30:13 -0500 Subject: [PATCH 18/86] MAINTAINERS: Update mailing list for radeon and amdgpu amdgpu and radeon development has moved to this list. Signed-off-by: Alex Deucher --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c7b8cf1240d9..cb4611867a88 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4116,7 +4116,7 @@ F: drivers/gpu/drm/cirrus/ RADEON and AMDGPU DRM DRIVERS M: Alex Deucher M: Christian König -L: dri-devel@lists.freedesktop.org +L: amd-gfx@lists.freedesktop.org T: git git://people.freedesktop.org/~agd5f/linux S: Supported F: drivers/gpu/drm/radeon/ From c4642a479fac9f5c224ff7425d86c427b94011af Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Wed, 14 Dec 2016 15:32:28 -0500 Subject: [PATCH 19/86] drm/amd/amdgpu: add Polaris12 support (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2: agd: squash in various fixes v3: agd: squash in: drm/amdgpu: remove unnecessary smc sk firmware for polaris12 Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 13 ++++++--- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 31 ++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/vi.c | 10 +++++++ drivers/gpu/drm/amd/include/amd_shared.h | 1 + 11 files changed, 76 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c index 9ada56c16a58..4c851fde1e82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c @@ -840,6 +840,9 @@ static int amdgpu_cgs_get_firmware_info(struct cgs_device *cgs_device, else if (type == CGS_UCODE_ID_SMU_SK) strcpy(fw_name, "amdgpu/polaris10_smc_sk.bin"); break; + case CHIP_POLARIS12: + strcpy(fw_name, "amdgpu/polaris12_smc.bin"); + break; default: DRM_ERROR("SMC firmware not supported\n"); return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 60bd4afe45c8..fe3bb94fe58d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -73,6 +73,7 @@ static const char *amdgpu_asic_name[] = { "STONEY", "POLARIS10", "POLARIS11", + "POLARIS12", "LAST", }; @@ -1277,6 +1278,7 @@ static int amdgpu_early_init(struct amdgpu_device *adev) case CHIP_FIJI: case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_CARRIZO: case CHIP_STONEY: if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index a81dfaeeb8c0..1d564beb0fde 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -65,6 +65,7 @@ #define FIRMWARE_STONEY "amdgpu/stoney_uvd.bin" #define FIRMWARE_POLARIS10 "amdgpu/polaris10_uvd.bin" #define FIRMWARE_POLARIS11 "amdgpu/polaris11_uvd.bin" +#define FIRMWARE_POLARIS12 "amdgpu/polaris12_uvd.bin" /** * amdgpu_uvd_cs_ctx - Command submission parser context @@ -98,6 +99,7 @@ MODULE_FIRMWARE(FIRMWARE_FIJI); MODULE_FIRMWARE(FIRMWARE_STONEY); MODULE_FIRMWARE(FIRMWARE_POLARIS10); MODULE_FIRMWARE(FIRMWARE_POLARIS11); +MODULE_FIRMWARE(FIRMWARE_POLARIS12); static void amdgpu_uvd_idle_work_handler(struct work_struct *work); @@ -149,6 +151,9 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev) case CHIP_POLARIS11: fw_name = FIRMWARE_POLARIS11; break; + case CHIP_POLARIS12: + fw_name = FIRMWARE_POLARIS12; + break; default: return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 69b66b9e7f57..8fec802d3908 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -52,6 +52,7 @@ #define FIRMWARE_STONEY "amdgpu/stoney_vce.bin" #define FIRMWARE_POLARIS10 "amdgpu/polaris10_vce.bin" #define FIRMWARE_POLARIS11 "amdgpu/polaris11_vce.bin" +#define FIRMWARE_POLARIS12 "amdgpu/polaris12_vce.bin" #ifdef CONFIG_DRM_AMDGPU_CIK MODULE_FIRMWARE(FIRMWARE_BONAIRE); @@ -66,6 +67,7 @@ MODULE_FIRMWARE(FIRMWARE_FIJI); MODULE_FIRMWARE(FIRMWARE_STONEY); MODULE_FIRMWARE(FIRMWARE_POLARIS10); MODULE_FIRMWARE(FIRMWARE_POLARIS11); +MODULE_FIRMWARE(FIRMWARE_POLARIS12); static void amdgpu_vce_idle_work_handler(struct work_struct *work); @@ -121,6 +123,9 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size) case CHIP_POLARIS11: fw_name = FIRMWARE_POLARIS11; break; + case CHIP_POLARIS12: + fw_name = FIRMWARE_POLARIS12; + break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c index b3d62b909f43..2006abbbfb62 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c @@ -167,6 +167,7 @@ static void dce_v11_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(stoney_golden_settings_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, polaris11_golden_settings_a11, (const u32)ARRAY_SIZE(polaris11_golden_settings_a11)); @@ -608,6 +609,7 @@ static int dce_v11_0_get_num_crtc (struct amdgpu_device *adev) num_crtc = 6; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: num_crtc = 5; break; default: @@ -1589,6 +1591,7 @@ static int dce_v11_0_audio_init(struct amdgpu_device *adev) adev->mode_info.audio.num_pins = 8; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: adev->mode_info.audio.num_pins = 6; break; default: @@ -2388,7 +2391,8 @@ static u32 dce_v11_0_pick_pll(struct drm_crtc *crtc) int pll; if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(amdgpu_crtc->encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; @@ -2822,7 +2826,8 @@ static int dce_v11_0_crtc_mode_set(struct drm_crtc *crtc, return -EINVAL; if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(amdgpu_crtc->encoder); int encoder_mode = @@ -2992,6 +2997,7 @@ static int dce_v11_0_early_init(void *handle) adev->mode_info.num_dig = 6; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: adev->mode_info.num_hpd = 5; adev->mode_info.num_dig = 5; break; @@ -3101,7 +3107,8 @@ static int dce_v11_0_hw_init(void *handle) amdgpu_atombios_crtc_powergate_init(adev); amdgpu_atombios_encoder_init_dig(adev); if ((adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) { + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { amdgpu_atombios_crtc_set_dce_clock(adev, adev->clock.default_dispclk, DCE_CLOCK_TYPE_DISPCLK, ATOM_GCK_DFS); amdgpu_atombios_crtc_set_dce_clock(adev, 0, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index d0ec00986f38..373374164bd5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -139,6 +139,13 @@ MODULE_FIRMWARE("amdgpu/polaris10_mec.bin"); MODULE_FIRMWARE("amdgpu/polaris10_mec2.bin"); MODULE_FIRMWARE("amdgpu/polaris10_rlc.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_ce.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_pfp.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_me.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mec.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mec2.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_rlc.bin"); + static const struct amdgpu_gds_reg_offset amdgpu_gds_reg_offset[] = { {mmGDS_VMID0_BASE, mmGDS_VMID0_SIZE, mmGDS_GWS_VMID0, mmGDS_OA_VMID0}, @@ -689,6 +696,7 @@ static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(tonga_golden_common_all)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -903,6 +911,9 @@ static int gfx_v8_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_STONEY: chip_name = "stoney"; break; @@ -1768,6 +1779,7 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev) gb_addr_config = TONGA_GB_ADDR_CONFIG_GOLDEN; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: ret = amdgpu_atombios_get_gfx_info(adev); if (ret) return ret; @@ -2682,6 +2694,7 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev) break; case CHIP_POLARIS11: + case CHIP_POLARIS12: modearray[0] = (ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P4_16x16) | TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | @@ -3503,6 +3516,7 @@ gfx_v8_0_raster_config(struct amdgpu_device *adev, u32 *rconf, u32 *rconf1) *rconf1 |= 0x0; break; case CHIP_POLARIS11: + case CHIP_POLARIS12: *rconf |= RB_MAP_PKR0(2) | RB_XSEL2(1) | SE_MAP(2) | SE_XSEL(1) | SE_YSEL(1); *rconf1 |= 0x0; @@ -4021,7 +4035,8 @@ static void gfx_v8_0_init_pg(struct amdgpu_device *adev) cz_enable_cp_power_gating(adev, true); else cz_enable_cp_power_gating(adev, false); - } else if (adev->asic_type == CHIP_POLARIS11) { + } else if ((adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) { gfx_v8_0_init_csb(adev); gfx_v8_0_init_save_restore_list(adev); gfx_v8_0_enable_save_restore_machine(adev); @@ -4095,7 +4110,8 @@ static int gfx_v8_0_rlc_resume(struct amdgpu_device *adev) RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK); WREG32(mmRLC_CGCG_CGLS_CTRL, tmp); if (adev->asic_type == CHIP_POLARIS11 || - adev->asic_type == CHIP_POLARIS10) { + adev->asic_type == CHIP_POLARIS10 || + adev->asic_type == CHIP_POLARIS12) { tmp = RREG32(mmRLC_CGCG_CGLS_CTRL_3D); tmp &= ~0x3; WREG32(mmRLC_CGCG_CGLS_CTRL_3D, tmp); @@ -4283,6 +4299,7 @@ static int gfx_v8_0_cp_gfx_start(struct amdgpu_device *adev) amdgpu_ring_write(ring, 0x0000002A); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_ring_write(ring, 0x16000012); amdgpu_ring_write(ring, 0x00000000); break; @@ -4664,7 +4681,8 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev) (adev->asic_type == CHIP_FIJI) || (adev->asic_type == CHIP_STONEY) || (adev->asic_type == CHIP_POLARIS11) || - (adev->asic_type == CHIP_POLARIS10)) { + (adev->asic_type == CHIP_POLARIS10) || + (adev->asic_type == CHIP_POLARIS12)) { WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER, AMDGPU_DOORBELL_KIQ << 2); WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER, @@ -4700,7 +4718,8 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev) mqd->cp_hqd_persistent_state = tmp; if (adev->asic_type == CHIP_STONEY || adev->asic_type == CHIP_POLARIS11 || - adev->asic_type == CHIP_POLARIS10) { + adev->asic_type == CHIP_POLARIS10 || + adev->asic_type == CHIP_POLARIS12) { tmp = RREG32(mmCP_ME1_PIPE3_INT_CNTL); tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE3_INT_CNTL, GENERIC2_INT_ENABLE, 1); WREG32(mmCP_ME1_PIPE3_INT_CNTL, tmp); @@ -5279,7 +5298,8 @@ static int gfx_v8_0_late_init(void *handle) static void gfx_v8_0_enable_gfx_static_mg_power_gating(struct amdgpu_device *adev, bool enable) { - if (adev->asic_type == CHIP_POLARIS11) + if ((adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) /* Send msg to SMU via Powerplay */ amdgpu_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_SMC, @@ -5353,6 +5373,7 @@ static int gfx_v8_0_set_powergating_state(void *handle, gfx_v8_0_enable_gfx_dynamic_mg_power_gating(adev, false); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: if ((adev->pg_flags & AMD_PG_SUPPORT_GFX_SMG) && enable) gfx_v8_0_enable_gfx_static_mg_power_gating(adev, true); else diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 0daac3a5be79..476bc9f1954b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -46,6 +46,7 @@ static int gmc_v8_0_wait_for_idle(void *handle); MODULE_FIRMWARE("amdgpu/tonga_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_mc.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_mc.bin"); static const u32 golden_settings_tonga_a11[] = { @@ -130,6 +131,7 @@ static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(golden_settings_tonga_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -225,6 +227,9 @@ static int gmc_v8_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_FIJI: case CHIP_CARRIZO: case CHIP_STONEY: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 1170a64a3184..034ace79ed49 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -60,6 +60,8 @@ MODULE_FIRMWARE("amdgpu/polaris10_sdma.bin"); MODULE_FIRMWARE("amdgpu/polaris10_sdma1.bin"); MODULE_FIRMWARE("amdgpu/polaris11_sdma.bin"); MODULE_FIRMWARE("amdgpu/polaris11_sdma1.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_sdma.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_sdma1.bin"); static const u32 sdma_offsets[SDMA_MAX_INSTANCE] = @@ -206,6 +208,7 @@ static void sdma_v3_0_init_golden_registers(struct amdgpu_device *adev) (const u32)ARRAY_SIZE(golden_settings_tonga_a11)); break; case CHIP_POLARIS11: + case CHIP_POLARIS12: amdgpu_program_register_sequence(adev, golden_settings_polaris11_a11, (const u32)ARRAY_SIZE(golden_settings_polaris11_a11)); @@ -278,6 +281,9 @@ static int sdma_v3_0_init_microcode(struct amdgpu_device *adev) case CHIP_POLARIS10: chip_name = "polaris10"; break; + case CHIP_POLARIS12: + chip_name = "polaris12"; + break; case CHIP_CARRIZO: chip_name = "carrizo"; break; diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c index 6b3293a1c7b8..5fb0b7f5c065 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c @@ -320,11 +320,12 @@ static unsigned vce_v3_0_get_harvest_config(struct amdgpu_device *adev) { u32 tmp; - /* Fiji, Stoney, Polaris10, Polaris11 are single pipe */ + /* Fiji, Stoney, Polaris10, Polaris11, Polaris12 are single pipe */ if ((adev->asic_type == CHIP_FIJI) || (adev->asic_type == CHIP_STONEY) || (adev->asic_type == CHIP_POLARIS10) || - (adev->asic_type == CHIP_POLARIS11)) + (adev->asic_type == CHIP_POLARIS11) || + (adev->asic_type == CHIP_POLARIS12)) return AMDGPU_VCE_HARVEST_VCE1; /* Tonga and CZ are dual or single pipe */ diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index bf088d6d9bf1..c2ac54f11341 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -88,6 +88,7 @@ MODULE_FIRMWARE("amdgpu/polaris10_smc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_smc_sk.bin"); MODULE_FIRMWARE("amdgpu/polaris11_smc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_smc_sk.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_smc.bin"); /* * Indirect registers accessor @@ -312,6 +313,7 @@ static void vi_init_golden_registers(struct amdgpu_device *adev) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: default: break; } @@ -671,6 +673,7 @@ static int vi_read_register(struct amdgpu_device *adev, u32 se_num, case CHIP_TONGA: case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_CARRIZO: case CHIP_STONEY: asic_register_table = cz_allowed_read_registers; @@ -994,6 +997,11 @@ static int vi_common_early_init(void *handle) adev->pg_flags = 0; adev->external_rev_id = adev->rev_id + 0x50; break; + case CHIP_POLARIS12: + adev->cg_flags = AMD_CG_SUPPORT_UVD_MGCG; + adev->pg_flags = 0; + adev->external_rev_id = adev->rev_id + 0x64; + break; case CHIP_CARRIZO: adev->cg_flags = AMD_CG_SUPPORT_UVD_MGCG | AMD_CG_SUPPORT_GFX_MGCG | @@ -1346,6 +1354,7 @@ static int vi_common_set_clockgating_state(void *handle, case CHIP_TONGA: case CHIP_POLARIS10: case CHIP_POLARIS11: + case CHIP_POLARIS12: vi_common_set_clockgating_state_by_smu(adev, state); default: break; @@ -1429,6 +1438,7 @@ int vi_set_ip_blocks(struct amdgpu_device *adev) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: amdgpu_ip_block_add(adev, &vi_common_ip_block); amdgpu_ip_block_add(adev, &gmc_v8_1_ip_block); amdgpu_ip_block_add(adev, &tonga_ih_ip_block); diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index c02469ada9f1..5f59117109d7 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -46,6 +46,7 @@ enum amd_asic_type { CHIP_STONEY, CHIP_POLARIS10, CHIP_POLARIS11, + CHIP_POLARIS12, CHIP_LAST, }; From f4309526576db325264b6dc9ee150ee70b330a42 Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Wed, 14 Dec 2016 15:40:48 -0500 Subject: [PATCH 20/86] drm/amdgpu/powerplay: add Polaris12 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c | 1 + drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c | 3 ++- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c | 2 +- drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c index fc592c2b0e16..95a568df8551 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c @@ -98,6 +98,7 @@ static int amdgpu_pp_early_init(void *handle) switch (adev->asic_type) { case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: case CHIP_TONGA: case CHIP_FIJI: case CHIP_TOPAZ: diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c index dc6700aee18f..b03606405a53 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c @@ -95,6 +95,7 @@ int hwmgr_init(struct amd_pp_init *pp_init, struct pp_instance *handle) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: polaris_set_asic_special_caps(hwmgr); hwmgr->feature_mask &= ~(PP_UVD_HANDSHAKE_MASK); break; @@ -745,7 +746,7 @@ int polaris_set_asic_special_caps(struct pp_hwmgr *hwmgr) phm_cap_set(hwmgr->platform_descriptor.platformCaps, PHM_PlatformCaps_TablelessHardwareInterface); - if (hwmgr->chip_id == CHIP_POLARIS11) + if ((hwmgr->chip_id == CHIP_POLARIS11) || (hwmgr->chip_id == CHIP_POLARIS12)) phm_cap_set(hwmgr->platform_descriptor.platformCaps, PHM_PlatformCaps_SPLLShutdownSupport); return 0; diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c index 26477f0f09dc..6cd1287a7a8f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_powertune.c @@ -521,7 +521,7 @@ int smu7_enable_didt_config(struct pp_hwmgr *hwmgr) PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); result = smu7_program_pt_config_registers(hwmgr, DIDTConfig_Polaris10); PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); - } else if (hwmgr->chip_id == CHIP_POLARIS11) { + } else if ((hwmgr->chip_id == CHIP_POLARIS11) || (hwmgr->chip_id == CHIP_POLARIS12)) { result = smu7_program_pt_config_registers(hwmgr, GCCACConfig_Polaris11); PP_ASSERT_WITH_CODE((result == 0), "DIDT Config failed.", return result); result = smu7_program_pt_config_registers(hwmgr, DIDTConfig_Polaris11); diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c index e5812aa456f3..6e618aa20719 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c @@ -65,6 +65,7 @@ int smum_init(struct amd_pp_init *pp_init, struct pp_instance *handle) break; case CHIP_POLARIS11: case CHIP_POLARIS10: + case CHIP_POLARIS12: polaris10_smum_init(smumgr); break; default: From fc8e9c54699e42754094ff475da46440778d8f19 Mon Sep 17 00:00:00 2001 From: Junwei Zhang Date: Thu, 4 Aug 2016 12:54:22 +0800 Subject: [PATCH 21/86] drm/amd/amdgpu: add Polaris12 PCI ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junwei Zhang Reviewed-by: Alex Deucher Reviewed-by: Christian König Reviewed-by: Ken Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8cb937b2bfcc..2534adaebe30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -418,6 +418,13 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x67CA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, + /* Polaris12 */ + {0x1002, 0x6980, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0, 0, 0} }; From d6df71e125b4e4ab8932349ce81e09ef73304b91 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Wed, 21 Dec 2016 14:32:21 +0800 Subject: [PATCH 22/86] drm/amdgpu: remove static integer for uvd pp state At two gpu core condition, static integer will cause that second gpu core uvd state setting will be directly skipped due to the first one setting Signed-off-by: Yintian Tao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c index a79e283590fb..6de6becce745 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c @@ -791,15 +791,10 @@ static int uvd_v5_0_set_clockgating_state(void *handle, { struct amdgpu_device *adev = (struct amdgpu_device *)handle; bool enable = (state == AMD_CG_STATE_GATE) ? true : false; - static int curstate = -1; if (!(adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG)) return 0; - if (curstate == state) - return 0; - - curstate = state; if (enable) { /* wait for STATUS to clear */ if (uvd_v5_0_wait_for_idle(handle)) From 70fd80d6f7e37bf637331c682fafcce1112750ac Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Wed, 21 Dec 2016 17:44:59 +0800 Subject: [PATCH 23/86] drm/amd/powerplay: extend smu's response timeout time. Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/amd_shared.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index 5f59117109d7..85f358764bbc 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -23,7 +23,7 @@ #ifndef __AMD_SHARED_H__ #define __AMD_SHARED_H__ -#define AMD_MAX_USEC_TIMEOUT 100000 /* 100 ms */ +#define AMD_MAX_USEC_TIMEOUT 200000 /* 200 ms */ /* * Supported ASIC types From 5165484b02f2cbedb5bf3a41ff5e8ae16069016c Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 15 Dec 2016 13:43:59 +0800 Subject: [PATCH 24/86] drm/amdgpu: update si kicker smc firmware Use the appropriate smc firmware for each chip revision. Using the wrong one can cause stability issues. Acked-by: Edward O'Callaghan Signed-off-by: Flora Cui Reviewed-by: Junwei Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 57 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 6c65a1a2de79..0566e9adaf8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -56,7 +56,6 @@ #define BIOS_SCRATCH_4 0x5cd MODULE_FIRMWARE("radeon/tahiti_smc.bin"); -MODULE_FIRMWARE("radeon/tahiti_k_smc.bin"); MODULE_FIRMWARE("radeon/pitcairn_smc.bin"); MODULE_FIRMWARE("radeon/pitcairn_k_smc.bin"); MODULE_FIRMWARE("radeon/verde_smc.bin"); @@ -7687,49 +7686,49 @@ static int si_dpm_init_microcode(struct amdgpu_device *adev) chip_name = "tahiti"; break; case CHIP_PITCAIRN: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->device == 0x6810) || - (adev->pdev->device == 0x6811) || - (adev->pdev->device == 0x6816) || - (adev->pdev->device == 0x6817) || - (adev->pdev->device == 0x6806)) + if ((adev->pdev->revision == 0x81) && + ((adev->pdev->device == 0x6810) || + (adev->pdev->device == 0x6811))) chip_name = "pitcairn_k"; else chip_name = "pitcairn"; break; case CHIP_VERDE: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6820) || - (adev->pdev->device == 0x6821) || - (adev->pdev->device == 0x6822) || - (adev->pdev->device == 0x6823) || - (adev->pdev->device == 0x682A) || - (adev->pdev->device == 0x682B)) + if (((adev->pdev->device == 0x6820) && + ((adev->pdev->revision == 0x81) || + (adev->pdev->revision == 0x83))) || + ((adev->pdev->device == 0x6821) && + ((adev->pdev->revision == 0x83) || + (adev->pdev->revision == 0x87))) || + ((adev->pdev->revision == 0x87) && + ((adev->pdev->device == 0x6823) || + (adev->pdev->device == 0x682b)))) chip_name = "verde_k"; else chip_name = "verde"; break; case CHIP_OLAND: - if ((adev->pdev->revision == 0xC7) || - (adev->pdev->revision == 0x80) || - (adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6604) || - (adev->pdev->device == 0x6605)) + if (((adev->pdev->revision == 0x81) && + ((adev->pdev->device == 0x6600) || + (adev->pdev->device == 0x6604) || + (adev->pdev->device == 0x6605) || + (adev->pdev->device == 0x6610))) || + ((adev->pdev->revision == 0x83) && + (adev->pdev->device == 0x6610))) chip_name = "oland_k"; else chip_name = "oland"; break; case CHIP_HAINAN: - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0xC3) || - (adev->pdev->device == 0x6664) || - (adev->pdev->device == 0x6665) || - (adev->pdev->device == 0x6667)) + if (((adev->pdev->revision == 0x81) && + (adev->pdev->device == 0x6660)) || + ((adev->pdev->revision == 0x83) && + ((adev->pdev->device == 0x6660) || + (adev->pdev->device == 0x6663) || + (adev->pdev->device == 0x6665) || + (adev->pdev->device == 0x6667))) || + ((adev->pdev->revision == 0xc3) && + (adev->pdev->device == 0x6665))) chip_name = "hainan_k"; else chip_name = "hainan"; From 6458bd4dfd9414cba5804eb9907fe2a824278c34 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 12:15:52 -0500 Subject: [PATCH 25/86] drm/radeon: update smc firmware selection for SI Use the appropriate smc firmware for each chip revision. Using the wrong one can cause stability issues. Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/si.c | 60 +++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index ad4d7b8b8322..e8a38d296855 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -50,7 +50,6 @@ MODULE_FIRMWARE("radeon/tahiti_ce.bin"); MODULE_FIRMWARE("radeon/tahiti_mc.bin"); MODULE_FIRMWARE("radeon/tahiti_rlc.bin"); MODULE_FIRMWARE("radeon/tahiti_smc.bin"); -MODULE_FIRMWARE("radeon/tahiti_k_smc.bin"); MODULE_FIRMWARE("radeon/PITCAIRN_pfp.bin"); MODULE_FIRMWARE("radeon/PITCAIRN_me.bin"); @@ -1657,9 +1656,6 @@ static int si_init_microcode(struct radeon_device *rdev) switch (rdev->family) { case CHIP_TAHITI: chip_name = "TAHITI"; - /* XXX: figure out which Tahitis need the new ucode */ - if (0) - new_smc = true; new_chip_name = "tahiti"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; me_req_size = SI_PM4_UCODE_SIZE * 4; @@ -1671,12 +1667,9 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_PITCAIRN: chip_name = "PITCAIRN"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->device == 0x6810) || - (rdev->pdev->device == 0x6811) || - (rdev->pdev->device == 0x6816) || - (rdev->pdev->device == 0x6817) || - (rdev->pdev->device == 0x6806)) + if ((rdev->pdev->revision == 0x81) && + ((rdev->pdev->device == 0x6810) || + (rdev->pdev->device == 0x6811))) new_smc = true; new_chip_name = "pitcairn"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1689,15 +1682,15 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_VERDE: chip_name = "VERDE"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6820) || - (rdev->pdev->device == 0x6821) || - (rdev->pdev->device == 0x6822) || - (rdev->pdev->device == 0x6823) || - (rdev->pdev->device == 0x682A) || - (rdev->pdev->device == 0x682B)) + if (((rdev->pdev->device == 0x6820) && + ((rdev->pdev->revision == 0x81) || + (rdev->pdev->revision == 0x83))) || + ((rdev->pdev->device == 0x6821) && + ((rdev->pdev->revision == 0x83) || + (rdev->pdev->revision == 0x87))) || + ((rdev->pdev->revision == 0x87) && + ((rdev->pdev->device == 0x6823) || + (rdev->pdev->device == 0x682b)))) new_smc = true; new_chip_name = "verde"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1710,13 +1703,13 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_OLAND: chip_name = "OLAND"; - if ((rdev->pdev->revision == 0xC7) || - (rdev->pdev->revision == 0x80) || - (rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6604) || - (rdev->pdev->device == 0x6605)) + if (((rdev->pdev->revision == 0x81) && + ((rdev->pdev->device == 0x6600) || + (rdev->pdev->device == 0x6604) || + (rdev->pdev->device == 0x6605) || + (rdev->pdev->device == 0x6610))) || + ((rdev->pdev->revision == 0x83) && + (rdev->pdev->device == 0x6610))) new_smc = true; new_chip_name = "oland"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; @@ -1728,12 +1721,15 @@ static int si_init_microcode(struct radeon_device *rdev) break; case CHIP_HAINAN: chip_name = "HAINAN"; - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0xC3) || - (rdev->pdev->device == 0x6664) || - (rdev->pdev->device == 0x6665) || - (rdev->pdev->device == 0x6667)) + if (((rdev->pdev->revision == 0x81) && + (rdev->pdev->device == 0x6660)) || + ((rdev->pdev->revision == 0x83) && + ((rdev->pdev->device == 0x6660) || + (rdev->pdev->device == 0x6663) || + (rdev->pdev->device == 0x6665) || + (rdev->pdev->device == 0x6667))) || + ((rdev->pdev->revision == 0xc3) && + (rdev->pdev->device == 0x6665))) new_smc = true; new_chip_name = "hainan"; pfp_req_size = SI_PFP_UCODE_SIZE * 4; From 8a08403bcb39f5d0e733bcf59a8a74f16b538f6e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 12:39:01 -0500 Subject: [PATCH 26/86] drm/radeon: drop verde dpm quirks fixes: https://bugs.freedesktop.org/show_bug.cgi?id=98897 https://bugs.launchpad.net/bugs/1651981 Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Cc: Adrian Fiergolski --- drivers/gpu/drm/radeon/si_dpm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 8b5e697f2549..13ba73fd9b68 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -3008,19 +3008,6 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, (rdev->pdev->device == 0x6817) || (rdev->pdev->device == 0x6806)) max_mclk = 120000; - } else if (rdev->family == CHIP_VERDE) { - if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || - (rdev->pdev->revision == 0x87) || - (rdev->pdev->device == 0x6820) || - (rdev->pdev->device == 0x6821) || - (rdev->pdev->device == 0x6822) || - (rdev->pdev->device == 0x6823) || - (rdev->pdev->device == 0x682A) || - (rdev->pdev->device == 0x682B)) { - max_sclk = 75000; - max_mclk = 80000; - } } else if (rdev->family == CHIP_OLAND) { if ((rdev->pdev->revision == 0xC7) || (rdev->pdev->revision == 0x80) || From 7192c54a68013f6058b1bb505645fcd07015191c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 5 Jan 2017 13:02:37 -0500 Subject: [PATCH 27/86] drm/amdgpu: drop verde dpm quirks Port of radeon change to amdgpu. Acked-by: Edward O'Callaghan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 0566e9adaf8a..10bedfac27b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -3487,19 +3487,6 @@ static void si_apply_state_adjust_rules(struct amdgpu_device *adev, (adev->pdev->device == 0x6817) || (adev->pdev->device == 0x6806)) max_mclk = 120000; - } else if (adev->asic_type == CHIP_VERDE) { - if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || - (adev->pdev->revision == 0x87) || - (adev->pdev->device == 0x6820) || - (adev->pdev->device == 0x6821) || - (adev->pdev->device == 0x6822) || - (adev->pdev->device == 0x6823) || - (adev->pdev->device == 0x682A) || - (adev->pdev->device == 0x682B)) { - max_sclk = 75000; - max_mclk = 80000; - } } else if (adev->asic_type == CHIP_OLAND) { if ((adev->pdev->revision == 0xC7) || (adev->pdev->revision == 0x80) || From 67c408cfa8862fe7e45b3a1f762f7140e03b7217 Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sat, 7 Jan 2017 23:53:00 +0100 Subject: [PATCH 28/86] ipv6: fix typos o s/approriate/appropriate o s/discouvery/discovery Signed-off-by: Alexander Alemayhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8417c41d8ec8..ce5aaf448c54 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1464,7 +1464,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_node *fn; /* Get the "current" route for this destination and - * check if the redirect has come from approriate router. + * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -2768,7 +2768,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) old MTU is the lowest MTU in the path, update the route PMTU to reflect the increase. In this case if the other nodes' MTU also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discouvery. + PMTU discovery. */ if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && From b007f09072ca8afa118ade333e717ba443e8d807 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 9 Jan 2017 10:45:49 +0300 Subject: [PATCH 29/86] ipv4: make tcp_notsent_lowat sysctl knob behave as true unsigned int > cat /proc/sys/net/ipv4/tcp_notsent_lowat -1 > echo 4294967295 > /proc/sys/net/ipv4/tcp_notsent_lowat -bash: echo: write error: Invalid argument > echo -2147483648 > /proc/sys/net/ipv4/tcp_notsent_lowat > cat /proc/sys/net/ipv4/tcp_notsent_lowat -2147483648 but in documentation we have "tcp_notsent_lowat - UNSIGNED INTEGER" v2: simplify to just proc_douintvec Signed-off-by: Pavel Tikhomirov Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 22cbd61079b5..b2fa498b15d1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -951,7 +951,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", From ce7e40c432ba84da104438f6799d460a4cad41bc Mon Sep 17 00:00:00 2001 From: Vlad Tsyrklevich Date: Mon, 9 Jan 2017 20:57:48 +0700 Subject: [PATCH 30/86] net/appletalk: Fix kernel memory disclosure ipddp_route structs contain alignment padding so kernel heap memory is leaked when they are copied to user space in ipddp_ioctl(SIOCFINDIPDDPRT). Change kmalloc() to kzalloc() to clear that memory. Signed-off-by: Vlad Tsyrklevich Signed-off-by: David S. Miller --- drivers/net/appletalk/ipddp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index b8c293373ecc..a306de4318d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -190,7 +190,7 @@ static netdev_tx_t ipddp_xmit(struct sk_buff *skb, struct net_device *dev) */ static int ipddp_create(struct ipddp_route *new_rt) { - struct ipddp_route *rt = kmalloc(sizeof(*rt), GFP_KERNEL); + struct ipddp_route *rt = kzalloc(sizeof(*rt), GFP_KERNEL); if (rt == NULL) return -ENOMEM; From 2ebae8bd60188f57e26e95e7fde6b8943297d348 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 9 Jan 2017 15:17:27 +0100 Subject: [PATCH 31/86] net: phy: Add Meson GXL PHY hardware dependency As I understand it the Meson GXL PHY driver is only useful on one architecture so only make it visible on that architecture. Signed-off-by: Jean Delvare Fixes: 7334b3e47aee ("net: phy: Add Meson GXL Internal PHY driver") Cc: Neil Armstrong Cc: Florian Fainelli Cc: Andrew Lunn Cc: David S. Miller Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index d361835b315d..8dbd59baa34d 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -279,6 +279,7 @@ config MARVELL_PHY config MESON_GXL_PHY tristate "Amlogic Meson GXL Internal PHY" + depends on ARCH_MESON || COMPILE_TEST ---help--- Currently has a driver for the Amlogic Meson GXL Internal PHY From 6bb629db5e7daa619f5242b6ad93e4dd9bf7432c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 08:51:32 -0800 Subject: [PATCH 32/86] tcp: do not export tcp_peer_is_proven() After commit 1fb6f159fd21 ("tcp: add tcp_conn_request"), tcp_peer_is_proven() no longer needs to be exported. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d46f4d5b1c62..ba8f02d0f283 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -606,7 +606,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, return ret; } -EXPORT_SYMBOL_GPL(tcp_peer_is_proven); void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) { From dc647ec88e029307e60e6bf9988056605f11051a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 09:30:51 +0100 Subject: [PATCH 33/86] net: socket: Make unnecessarily global sockfs_setattr() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sockfs_setattr() static as it is not used outside of net/socket.c This fixes the following GCC warning: net/socket.c:534:5: warning: no previous prototype for ‘sockfs_setattr’ [-Wmissing-prototypes] Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Cc: Lorenzo Colitti Signed-off-by: Tobias Klauser Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index a8c2307590b8..0758e13754e2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); From 8fb280616878b81c0790a0c33acbeec59c5711f4 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 10 Jan 2017 17:04:06 +0800 Subject: [PATCH 34/86] r8152: split rtl8152_suspend function Split rtl8152_suspend() into rtl8152_system_suspend() and rtl8152_rumtime_suspend(). Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 57 +++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 7dc61228c55b..c5e6d88de4e4 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3576,39 +3576,62 @@ static bool delay_autosuspend(struct r8152 *tp) return false; } -static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) +static int rtl8152_rumtime_suspend(struct r8152 *tp) { - struct r8152 *tp = usb_get_intfdata(intf); struct net_device *netdev = tp->netdev; int ret = 0; - mutex_lock(&tp->control); - - if (PMSG_IS_AUTO(message)) { - if (netif_running(netdev) && delay_autosuspend(tp)) { + if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { + if (delay_autosuspend(tp)) { ret = -EBUSY; goto out1; } - set_bit(SELECTIVE_SUSPEND, &tp->flags); - } else { - netif_device_detach(netdev); + clear_bit(WORK_ENABLE, &tp->flags); + usb_kill_urb(tp->intr_urb); + napi_disable(&tp->napi); + rtl_stop_rx(tp); + tp->rtl_ops.autosuspend_en(tp, true); + napi_enable(&tp->napi); } + set_bit(SELECTIVE_SUSPEND, &tp->flags); + +out1: + return ret; +} + +static int rtl8152_system_suspend(struct r8152 *tp) +{ + struct net_device *netdev = tp->netdev; + int ret = 0; + + netif_device_detach(netdev); + if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { clear_bit(WORK_ENABLE, &tp->flags); usb_kill_urb(tp->intr_urb); napi_disable(&tp->napi); - if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) { - rtl_stop_rx(tp); - tp->rtl_ops.autosuspend_en(tp, true); - } else { - cancel_delayed_work_sync(&tp->schedule); - tp->rtl_ops.down(tp); - } + cancel_delayed_work_sync(&tp->schedule); + tp->rtl_ops.down(tp); napi_enable(&tp->napi); } -out1: + + return ret; +} + +static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) +{ + struct r8152 *tp = usb_get_intfdata(intf); + int ret; + + mutex_lock(&tp->control); + + if (PMSG_IS_AUTO(message)) + ret = rtl8152_rumtime_suspend(tp); + else + ret = rtl8152_system_suspend(tp); + mutex_unlock(&tp->control); return ret; From 75dc692eda114cb234a46cb11893a9c3ea520934 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 10 Jan 2017 17:04:07 +0800 Subject: [PATCH 35/86] r8152: fix rx issue for runtime suspend Pause the rx and make sure the rx fifo is empty when the autosuspend occurs. If the rx data comes when the driver is canceling the rx urb, the host controller would stop getting the data from the device and continue it after next rx urb is submitted. That is, one continuing data is split into two different urb buffers. That let the driver take the data as a rx descriptor, and unexpected behavior happens. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index c5e6d88de4e4..be418563cb18 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3582,17 +3582,42 @@ static int rtl8152_rumtime_suspend(struct r8152 *tp) int ret = 0; if (netif_running(netdev) && test_bit(WORK_ENABLE, &tp->flags)) { + u32 rcr = 0; + if (delay_autosuspend(tp)) { ret = -EBUSY; goto out1; } + if (netif_carrier_ok(netdev)) { + u32 ocp_data; + + rcr = ocp_read_dword(tp, MCU_TYPE_PLA, PLA_RCR); + ocp_data = rcr & ~RCR_ACPT_ALL; + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, ocp_data); + rxdy_gated_en(tp, true); + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, + PLA_OOB_CTRL); + if (!(ocp_data & RXFIFO_EMPTY)) { + rxdy_gated_en(tp, false); + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, rcr); + ret = -EBUSY; + goto out1; + } + } + clear_bit(WORK_ENABLE, &tp->flags); usb_kill_urb(tp->intr_urb); - napi_disable(&tp->napi); - rtl_stop_rx(tp); + tp->rtl_ops.autosuspend_en(tp, true); - napi_enable(&tp->napi); + + if (netif_carrier_ok(netdev)) { + napi_disable(&tp->napi); + rtl_stop_rx(tp); + rxdy_gated_en(tp, false); + ocp_write_dword(tp, MCU_TYPE_PLA, PLA_RCR, rcr); + napi_enable(&tp->napi); + } } set_bit(SELECTIVE_SUSPEND, &tp->flags); From d9584d8ccc06ba98f4fad8ec720de66b6659fd35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 11:18:01 -0800 Subject: [PATCH 36/86] net: skb_flow_get_be16() can be static Removes following sparse complain : net/core/flow_dissector.c:70:8: warning: symbol 'skb_flow_get_be16' was not declared. Should it be static? Fixes: 972d3876faa8 ("flow dissector: ICMP support") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index fe4e1531976c..1b7673aac59d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -67,8 +67,8 @@ EXPORT_SYMBOL(skb_flow_dissector_init); * The function will try to retrieve a be32 entity at * offset poff */ -__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, - int hlen) +static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, + void *data, int hlen) { __be16 *u, _u; From faf3a932fbeb77860226a8323eacb835edc98648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 9 Jan 2017 11:58:34 -0800 Subject: [PATCH 37/86] net: dsa: Ensure validity of dst->ds[0] It is perfectly possible to have non zero indexed switches being present in a DSA switch tree, in such a case, we will be deferencing a NULL pointer while dsa_cpu_port_ethtool_{setup,restore}. Be more defensive and ensure that dst->ds[0] is valid before doing anything with it. Fixes: 0c73c523cf73 ("net: dsa: Initialize CPU port ethtool ops per tree") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5fff951a0a49..da3862124545 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -394,9 +394,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->ds[0]); - if (err) - return err; + if (dst->ds[0]) { + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + } /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -433,7 +435,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->ds[0]); + if (dst->ds[0]) + dsa_cpu_port_ethtool_restore(dst->ds[0]); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; From 3512a1ad56174308a9fd3e10f4b1e3e152e9ec01 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 9 Jan 2017 14:31:58 -0800 Subject: [PATCH 38/86] net: qrtr: Mark 'buf' as little endian Failure to mark this pointer as __le32 causes checkers like sparse to complain: net/qrtr/qrtr.c:274:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:274:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:274:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:275:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:275:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:275:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:276:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:276:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:276:16: got restricted __le32 [usertype] Silence it. Cc: Bjorn Andersson Signed-off-by: Stephen Boyd Acked-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c985ecbe9bd6..ae5ac175b2be 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, const int pkt_len = 20; struct qrtr_hdr *hdr; struct sk_buff *skb; - u32 *buf; + __le32 *buf; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); if (!skb) @@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, hdr->dst_node_id = cpu_to_le32(dst_node); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); - buf = (u32 *)skb_put(skb, pkt_len); + buf = (__le32 *)skb_put(skb, pkt_len); memset(buf, 0, pkt_len); buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); buf[1] = cpu_to_le32(src_node); From 5d722b3024f6762addb8642ffddc9f275b5107ae Mon Sep 17 00:00:00 2001 From: "Anna, Suman" Date: Mon, 9 Jan 2017 21:48:56 -0600 Subject: [PATCH 39/86] net: add the AF_QIPCRTR entries to family name tables Commit bdabad3e363d ("net: Add Qualcomm IPC router") introduced a new address family. Update the family name tables accordingly so that the lockdep initialization can use the proper names for this family. Cc: Courtney Cavin Cc: Bjorn Andersson Signed-off-by: Suman Anna Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index f560e0826009..4eca27dc5c94 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_MAX" + "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_MAX" + "slock-AF_QIPCRTR", "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_MAX" + "clock-AF_QIPCRTR", "clock-AF_MAX" }; /* From dc5367bcc556e97555fc94a32cd1aadbebdff47e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 10 Jan 2017 17:10:34 +0100 Subject: [PATCH 40/86] net/af_iucv: don't use paged skbs for TX on HiperSockets With commit e53743994e21 ("af_iucv: use paged SKBs for big outbound messages"), we transmit paged skbs for both of AF_IUCV's transport modes (IUCV or HiperSockets). The qeth driver for Layer 3 HiperSockets currently doesn't support NETIF_F_SG, so these skbs would just be linearized again by the stack. Avoid that overhead by using paged skbs only for IUCV transport. cc stable, since this also circumvents a significant skb leak when sending large messages (where the skb then needs to be linearized). Signed-off-by: Julian Wiedmann Signed-off-by: Ursula Braun Cc: # v4.8+ Fixes: e53743994e21 ("af_iucv: use paged SKBs for big outbound messages") Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cfb9e5f4e28f..13190b38f22e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1044,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); - size_t headroom, linear; + size_t headroom = 0; + size_t linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1122,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) - ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; - if (headroom + len < PAGE_SIZE) { + if (iucv->transport == AF_IUCV_TRANS_HIPER) { + headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; linear = len; } else { - /* In nonlinear "classic" iucv skb, - * reserve space for iucv_array - */ - if (iucv->transport != AF_IUCV_TRANS_HIPER) - headroom += sizeof(struct iucv_array) * - (MAX_SKB_FRAGS + 1); - linear = PAGE_SIZE - headroom; + if (len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + headroom = sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } } skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, noblock, &err, 0); From 9f9b74ef896792399dc7b5121896b9c963db80fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 10 Jan 2017 09:41:49 -0800 Subject: [PATCH 41/86] mlx4: Return EOPNOTSUPP instead of ENOTSUPP In commit b45f0674b997 ("mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs"), it changed EOPNOTSUPP to ENOTSUPP by mistake. This patch fixes it. Fixes: b45f0674b997 ("mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs") Signed-off-by: Martin KaFai Lau Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index edbe200ac2fa..4910d9af1933 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2277,7 +2277,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) if (priv->tx_ring_num[TX_XDP] && !mlx4_en_check_xdp_mtu(dev, new_mtu)) - return -ENOTSUPP; + return -EOPNOTSUPP; dev->mtu = new_mtu; From 1272ce87fa017ca4cf32920764d879656b7a005a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:01 -0800 Subject: [PATCH 42/86] gro: Enter slow-path if there is no tailroom The GRO path has a fast-path where we avoid calling pskb_may_pull and pskb_expand by directly accessing frag0. However, this should only be done if we have enough tailroom in the skb as otherwise we'll have to expand it later anyway. This patch adds the check by capping frag0_len with the skb tailroom. Fixes: cb18978cbf45 ("gro: Open-code final pskb_may_pull") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 8db5a0b4b520..88d2907ca2cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,7 +4441,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), + skb->end - skb->tail); } } From 57ea52a865144aedbcd619ee0081155e658b6f7d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:15 -0800 Subject: [PATCH 43/86] gro: Disable frag0 optimization on IPv6 ext headers The GRO fast path caches the frag0 address. This address becomes invalid if frag0 is modified by pskb_may_pull or its variants. So whenever that happens we must disable the frag0 optimization. This is usually done through the combination of gro_header_hard and gro_header_slow, however, the IPv6 extension header path did the pulling directly and would continue to use the GRO fast path incorrectly. This patch fixes it by disabling the fast path when we enter the IPv6 extension header path. Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++-- net/ipv6/ip6_offload.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 994f7423a74b..9bde9558b596 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2477,14 +2477,19 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) return NAPI_GRO_CB(skb)->frag0_len < hlen; } +static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; +} + static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + skb_gro_frag0_invalidate(skb); return skb->data + offset; } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 89c59e656f44..fc7b4017ba24 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { __pskb_pull(skb, skb_gro_offset(skb)); + skb_gro_frag0_invalidate(skb); proto = ipv6_gso_pull_exthdrs(skb, proto); skb_gro_pull(skb, -skb_transport_offset(skb)); skb_reset_transport_header(skb); From 5771f6ea8d5ccc0df4d02ae65833413150a1b829 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 10 Jan 2017 16:57:12 -0800 Subject: [PATCH 44/86] MAINTAINERS: remove duplicate bug filling description I have noticed that two different descriptions for B: entries in MAINTAINERS were merged: commit 686564434e88 ("MAINTAINERS: Add bug tracking system location entry type") and 2de2bd95f456 ("MAINTAINERS: add "B:" for URI where to file bugs"). This patch keeps the description from 2de2bd95f456. There has been a discussion [1] about whether this more detailed description is useful and what it exactly implies. I find it more useful and general, and the author of 686564434e88 agreed in the end that either is fine. [1] https://lkml.org/lkml/2016/12/8/71 Link: http://lkml.kernel.org/r/20161219085158.12114-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 35c9cbfe4f2d..0277df881da4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -81,7 +81,6 @@ Descriptions of section entries: Q: Patchwork web based patch tracking system site T: SCM tree type and location. Type is one of: git, hg, quilt, stgit, topgit - B: Bug tracking system location. S: Status, one of the following: Supported: Someone is actually paid to look after this. Maintained: Someone actually looks after it. From 965d004af54088d138f806d04d803fb60d441986 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:15 -0800 Subject: [PATCH 45/86] dax: fix deadlock with DAX 4k holes Currently in DAX if we have three read faults on the same hole address we can end up with the following: Thread 0 Thread 1 Thread 2 -------- -------- -------- dax_iomap_fault grab_mapping_entry lock_slot dax_iomap_fault grab_mapping_entry get_unlocked_mapping_entry dax_iomap_fault grab_mapping_entry get_unlocked_mapping_entry dax_load_hole find_or_create_page ... page_cache_tree_insert dax_wake_mapping_entry_waiter __radix_tree_replace get_page lock_page ... put_locked_mapping_entry unlock_page put_page The crux of the problem is that once we insert a 4k zero page, all locking from then on is done in terms of that 4k zero page and any additional threads sleeping on the empty DAX entry will never be woken. Fix this by waking all sleepers when we replace the DAX radix tree entry with a 4k zero page. This will allow all sleeping threads to successfully transition from locking based on the DAX empty entry to locking on the 4k zero page. With the test case reported by Xiong this happens very regularly in my test setup, with some runs resulting in 9+ threads in this deadlocked state. With this fix I've been able to run that same test dozens of times in a loop without issue. Fixes: ac401cc78242 ("dax: New fault locking") Link: http://lkml.kernel.org/r/1483479365-13607-1-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Xiong Zhou Reviewed-by: Jan Kara Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index d0e4d1002059..b772a33ef640 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,7 +138,7 @@ static int page_cache_tree_insert(struct address_space *mapping, dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, p, - false); + true); } } __radix_tree_replace(&mapping->page_tree, node, slot, page, From d670ffd87509b6b136d8ed6f757851a8ebe442b2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Jan 2017 16:57:18 -0800 Subject: [PATCH 46/86] mm/thp/pagecache/collapse: free the pte page table on collapse for thp page cache. With THP page cache, when trying to build a huge page from regular pte pages, we just clear the pmd entry. We will take another fault and at that point we will find the huge page in the radix tree, thereby using the huge page to complete the page fault The second fault path will allocate the needed pgtable_t page for archs like ppc64. So no need to deposit the same in collapse path. Depositing them in the collapse path resulting in a pgtable_t memory leak also giving errors like BUG: non-zero nr_ptes on freeing mm: 3 Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64") Link: http://lkml.kernel.org/r/20161212163428.6780-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e32389a97030..b0924a68cc36 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; - bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); - /* - * now deposit the pgtable for arch that need it - * otherwise free it. - */ - if (arch_needs_pgtable_deposit()) { - /* - * The deposit should be visibile only after - * collapse is seen by others. - */ - smp_wmb(); - pgtable_trans_huge_deposit(vma->vm_mm, pmd, - pmd_pgtable(_pmd)); - deposited = true; - } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - if (!deposited) { - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); - } + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } i_mmap_unlock_write(mapping); From 097963959594c5eccaba42510f7033f703211bda Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:21 -0800 Subject: [PATCH 47/86] mm: add follow_pte_pmd() Patch series "Write protect DAX PMDs in *sync path". Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss, as detailed in patch 2. This series is based on Dan's "libnvdimm-pending" branch, which is the current home for Jan's "dax: Page invalidation fixes" series. You can find a working tree here: https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean This patch (of 2): Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a huge page PMD leaf to be found and returned. Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Hansen Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fe6b4036664a..02793ac64ac6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1212,6 +1212,8 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 9f2c15cdb32c..b62f3bc63481 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3772,8 +3772,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; @@ -3790,11 +3790,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (pmd_huge(*pmd)) { + if (!pmdpp) + goto out; + + *ptlp = pmd_lock(mm, pmd); + if (pmd_huge(*pmd)) { + *pmdpp = pmd; + return 0; + } + spin_unlock(*ptlp); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); @@ -3817,10 +3826,24 @@ int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte(mm, address, ptepp, ptlp))); + !(res = __follow_pte_pmd(mm, address, ptepp, NULL, + ptlp))); return res; } +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, + ptlp))); + return res; +} +EXPORT_SYMBOL(follow_pte_pmd); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping From f729c8c9b24f0540a6e6b86e68f3888ba90ef7e7 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:24 -0800 Subject: [PATCH 48/86] dax: wrprotect pmd_t in dax_mapping_entry_mkclean Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss in the following sequence: 1) mmap write to DAX PMD, dirtying PMD radix tree entry and making the pmd_t dirty and writeable 2) fsync, flushing out PMD data and cleaning the radix tree entry. We currently fail to mark the pmd_t as clean and write protected. 3) more mmap writes to the PMD. These don't cause any page faults since the pmd_t is dirty and writeable. The radix tree entry remains clean. 4) fsync, which fails to flush the dirty PMD data because the radix tree entry was clean. 5) crash - dirty data that should have been fsync'd as part of 4) could still have been in the processor cache, and is lost. Fix this by marking the pmd_t clean and write protected in dax_mapping_entry_mkclean(), which is called as part of the fsync operation 2). This will cause the writes in step 3) above to generate page faults where we'll re-dirty the PMD radix tree entry, resulting in flushes in the fsync that happens in step 4). Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush") Link: http://lkml.kernel.org/r/1482272586-21177-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 51 ++++++++++++++++++++++++++++++++-------------- include/linux/mm.h | 2 -- mm/memory.c | 4 ++-- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5c74f60d0a50..ddcddfeaa03b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -691,8 +691,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pgoff_t index, unsigned long pfn) { struct vm_area_struct *vma; - pte_t *ptep; - pte_t pte; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; spinlock_t *ptl; bool changed; @@ -707,21 +707,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, address = pgoff_address(index, vma); changed = false; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) continue; - if (pfn != pte_pfn(*ptep)) - goto unlock; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock; - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; -unlock: - pte_unmap_unlock(ptep, ptl); + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: + spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } if (changed) mmu_notifier_invalidate_page(vma->vm_mm, address); diff --git a/include/linux/mm.h b/include/linux/mm.h index 02793ac64ac6..b84615b0f64c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,8 +1210,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index b62f3bc63481..6bf2b471e30c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3819,8 +3819,8 @@ out: return -EINVAL; } -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp) +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { int res; From bb1107f7c6052c863692a41f78c000db792334bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:27 -0800 Subject: [PATCH 49/86] mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER Andrey Konovalov has reported the following warning triggered by the syzkaller fuzzer. WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __alloc_pages_slowpath mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781 alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072 alloc_pages include/linux/gfp.h:469 kmalloc_order+0x1f/0x70 mm/slab_common.c:1015 kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026 kmalloc_large include/linux/slab.h:422 __kmalloc+0x210/0x2d0 mm/slub.c:3723 kmalloc include/linux/slab.h:495 ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x170/0x4e0 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 The issue is caused by a lack of size check for the request size in ep_write_iter which should be fixed. It, however, points to another problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the resulting page allocator request might be MAX_ORDER which is too large (see __alloc_pages_slowpath). The same applies to the SLOB allocator which allows even larger sizes. Make sure that they are capped properly and never request more than MAX_ORDER order. Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Andrey Konovalov Acked-by: Christoph Lameter Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 084b12bad198..4c5363566815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr, * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr, * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX 30 +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif From 7984c27c2c5cd3298de8afdba3e1bd46f884e934 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:30 -0800 Subject: [PATCH 50/86] bpf: do not use KMALLOC_SHIFT_MAX Commit 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer overflow") has added checks for the maximum allocateable size. It (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect it is not very clean because we already have KMALLOC_MAX_SIZE for this very reason so let's change both checks to use KMALLOC_MAX_SIZE instead. The original motivation for using KMALLOC_SHIFT_MAX was to work around an incorrect KMALLOC_MAX_SIZE which could lead to allocation warnings but it is no longer needed since "slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER". Link: http://lkml.kernel.org/r/20161220130659.16461-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Christoph Lameter Cc: Alexei Starovoitov Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/hashtab.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a2ac051c342f..229a5d5df977 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); - if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34debc1a9641..3f2bb58952d8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -274,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + if (htab->map.value_size >= KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct htab_elem)) /* if value_size is bigger, the user space won't be able to * access the elements via bpf syscall. This check also makes From e7ee2c089e94067d68475990bdeed211c8852917 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Tue, 10 Jan 2017 16:57:33 -0800 Subject: [PATCH 51/86] ocfs2: fix crash caused by stale lvb with fsdlm plugin The crash happens rather often when we reset some cluster nodes while nodes contend fiercely to do truncate and append. The crash backtrace is below: dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover_grant 1 locks on 971 resources dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover 9 generation 5 done: 4 ms ocfs2: Begin replay journal (node 318952601, slot 2) on device (253,18) ocfs2: End replay journal (node 318952601, slot 2) on device (253,18) ocfs2: Beginning quota recovery on device (253,18) for slot 2 ocfs2: Finishing quota recovery on device (253,18) for slot 2 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: bug expression: le64_to_cpu(fe->i_size) != i_size_read(inode) (truncate,30154,1):ocfs2_truncate_file:470 ERROR: Inode 290321, inode i_size = 732 != di i_size = 937, i_flags = 0x1 ------------[ cut here ]------------ kernel BUG at /usr/src/linux/fs/ocfs2/file.c:470! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2_stack_user(OEN) ocfs2(OEN) ocfs2_nodemanager ocfs2_stackglue(OEN) quota_tree dlm(OEN) configfs fuse sd_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs softdog xfs libcrc32c ppdev parport_pc pcspkr parport joydev virtio_balloon virtio_net i2c_piix4 acpi_cpufreq button processor ext4 crc16 jbd2 mbcache ata_generic cirrus virtio_blk ata_piix drm_kms_helper ahci syscopyarea libahci sysfillrect sysimgblt fb_sys_fops ttm floppy libata drm virtio_pci virtio_ring uhci_hcd virtio ehci_hcd usbcore serio_raw usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: No, Unsupported modules are loaded CPU: 1 PID: 30154 Comm: truncate Tainted: G OE N 4.4.21-69-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20151112_172657-sheep25 04/01/2014 task: ffff88004ff6d240 ti: ffff880074e68000 task.ti: ffff880074e68000 RIP: 0010:[] [] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] RSP: 0018:ffff880074e6bd50 EFLAGS: 00010282 RAX: 0000000000000074 RBX: 000000000000029e RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffff880074e6bda8 R08: 000000003675dc7a R09: ffffffff82013414 R10: 0000000000034c50 R11: 0000000000000000 R12: ffff88003aab3448 R13: 00000000000002dc R14: 0000000000046e11 R15: 0000000000000020 FS: 00007f839f965700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f839f97e000 CR3: 0000000036723000 CR4: 00000000000006e0 Call Trace: ocfs2_setattr+0x698/0xa90 [ocfs2] notify_change+0x1ae/0x380 do_truncate+0x5e/0x90 do_sys_ftruncate.constprop.11+0x108/0x160 entry_SYSCALL_64_fastpath+0x12/0x6d Code: 24 28 ba d6 01 00 00 48 c7 c6 30 43 62 a0 8b 41 2c 89 44 24 08 48 8b 41 20 48 c7 c1 78 a3 62 a0 48 89 04 24 31 c0 e8 a0 97 f9 ff <0f> 0b 3d 00 fe ff ff 0f 84 ab fd ff ff 83 f8 fc 0f 84 a2 fd ff RIP [] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] It's because ocfs2_inode_lock() get us stale LVB in which the i_size is not equal to the disk i_size. We mistakenly trust the LVB because the underlaying fsdlm dlm_lock() doesn't set lkb_sbflags with DLM_SBF_VALNOTVALID properly for us. But, why? The current code tries to downconvert lock without DLM_LKF_VALBLK flag to tell o2cb don't update RSB's LVB if it's a PR->NULL conversion, even if the lock resource type needs LVB. This is not the right way for fsdlm. The fsdlm plugin behaves different on DLM_LKF_VALBLK, it depends on DLM_LKF_VALBLK to decide if we care about the LVB in the LKB. If DLM_LKF_VALBLK is not set, fsdlm will skip recovering RSB's LVB from this lkb and set the right DLM_SBF_VALNOTVALID appropriately when node failure happens. The following diagram briefly illustrates how this crash happens: RSB1 is inode metadata lock resource with LOCK_TYPE_USES_LVB; The 1st round: Node1 Node2 RSB1: PR RSB1(master): NULL->EX ocfs2_downconvert_lock(PR->NULL, set_lvb==0) ocfs2_dlm_lock(no DLM_LKF_VALBLK) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dlm_lock(no DLM_LKF_VALBLK) convert_lock(overwrite lkb->lkb_exflags with no DLM_LKF_VALBLK) RSB1: NULL RSB1: EX reset Node2 dlm_recover_rsbs() recover_lvb() /* The LVB is not trustable if the node with EX fails and * no lock >= PR is left. We should set RSB_VALNOTVALID for RSB1. */ if(!(kb_exflags & DLM_LKF_VALBLK)) /* This means we miss the chance to return; * to invalid the LVB here. */ The 2nd round: Node 1 Node2 RSB1(become master from recovery) ocfs2_setattr() ocfs2_inode_lock(NULL->EX) /* dlm_lock() return the stale lvb without setting DLM_SBF_VALNOTVALID */ ocfs2_meta_lvb_is_trustable() return 1 /* so we don't refresh inode from disk */ ocfs2_truncate_file() mlog_bug_on_msg(disk isize != i_size_read(inode)) /* crash! */ The fix is quite straightforward. We keep to set DLM_LKF_VALBLK flag for dlm_lock() if the lock resource type needs LVB and the fsdlm plugin is uesed. Link: http://lkml.kernel.org/r/1481275846-6604-1-git-send-email-zren@suse.com Signed-off-by: Eric Ren Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 10 ++++++++++ fs/ocfs2/stackglue.c | 6 ++++++ fs/ocfs2/stackglue.h | 3 +++ 3 files changed, 19 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 83d576f6a287..77d1632e905d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 52c07346bea3..820359096c7a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index f2dce10fae54..e3036e1790e8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ From f931ab479dd24cf7a2c6e2df19778406892591fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Jan 2017 16:57:36 -0800 Subject: [PATCH 52/86] mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done} Both arch_add_memory() and arch_remove_memory() expect a single threaded context. For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does not hold any locks over this check and branch: if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); The result is that two threads calling devm_memremap_pages() simultaneously can end up colliding on pgd initialization. This leads to crash signatures like the following where the loser of the race initializes the wrong pgd entry: BUG: unable to handle kernel paging request at ffff888ebfff0000 IP: memcpy_erms+0x6/0x10 PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */ Oops: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13 task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000 RIP: memcpy_erms+0x6/0x10 [..] Call Trace: ? pmem_do_bvec+0x205/0x370 [nd_pmem] ? blk_queue_enter+0x3a/0x280 pmem_rw_page+0x38/0x80 [nd_pmem] bdev_read_page+0x84/0xb0 Hold the standard memory hotplug mutex over calls to arch_{add,remove}_memory(). Fixes: 41e94a851304 ("add devm_memremap_pages") Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/memremap.c b/kernel/memremap.c index b501e390bb34..9ecedc28b928 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + mem_hotplug_begin(); arch_remove_memory(align_start, align_size); + mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; + mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + mem_hotplug_done(); if (error) goto err_add_memory; From 2df26639e708a88dcc22171949da638a9998f3bc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:39 -0800 Subject: [PATCH 53/86] mm: fix remote numa hits statistics Jia He has noticed that commit b9f00e147f27 ("mm, page_alloc: reduce branches in zone_statistics") has an unintentional side effect that remote node allocation requests are accounted as NUMA_MISS rathat than NUMA_HIT and NUMA_OTHER if such a request doesn't use __GFP_OTHER_NODE. There are many of these potentially because the flag is used very rarely while we have many users of __alloc_pages_node. Fix this by simply ignoring __GFP_OTHER_NODE (it can be removed in a follow up patch) and treat all allocations that were satisfied from the preferred zone's node as NUMA_HITS because this is the same node we requested the allocation from in most cases. If this is not the local node then we just account it as NUMA_OTHER rather than NUMA_LOCAL. One downsize would be that an allocation request for a node which is outside of the mempolicy nodemask would be reported as a hit which is a bit weird but that was the case before b9f00e147f27 already. Fixes: b9f00e147f27 ("mm, page_alloc: reduce branches in zone_statistics") Link: http://lkml.kernel.org/r/20170102153057.9451-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Jia He Reviewed-by: Vlastimil Babka # with cbmc[1] superpowers Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c6d5f64feca..cba2a64792e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2583,30 +2583,23 @@ int __isolate_free_page(struct page *page, unsigned int order) * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { #ifdef CONFIG_NUMA - int local_nid = numa_node_id(); enum zone_stat_item local_stat = NUMA_LOCAL; - if (unlikely(flags & __GFP_OTHER_NODE)) { + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - if (z->node == local_nid) { + if (z->node == preferred_zone->node) __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { + else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } + __inc_zone_state(z, local_stat); #endif } From 41b6167e8f746b475668f1da78599fc4284f18db Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:42 -0800 Subject: [PATCH 54/86] mm: get rid of __GFP_OTHER_NODE The flag was introduced by commit 78afd5612deb ("mm: add __GFP_OTHER_NODE flag") to allow proper accounting of remote node allocations done by kernel daemons on behalf of a process - e.g. khugepaged. After "mm: fix remote numa hits statistics" we do not need and actually use the flag so we can safely remove it because all allocations which are satisfied from their "home" node are accounted properly. [mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170106122225.GK5556@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170102153057.9451-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++---------- include/trace/events/mmflags.h | 3 +-- mm/huge_memory.c | 3 +-- mm/khugepaged.c | 5 ++--- mm/page_alloc.c | 5 ++--- tools/perf/builtin-kmem.c | 1 - 6 files changed, 9 insertions(+), 21 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4175dca4ac39..7806a8f80abc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -38,9 +38,8 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u -#define ___GFP_KSWAPD_RECLAIM 0x2000000u +#define ___GFP_WRITE 0x800000u +#define ___GFP_KSWAPD_RECLAIM 0x1000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -172,11 +171,6 @@ struct vm_area_struct; * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of * distinguishing in the source between false positives and allocations that * cannot be supported (e.g. page tables). - * - * __GFP_OTHER_NODE is for allocations that are on a remote node but that - * should not be accounted for as a remote allocation in vmstat. A - * typical user would be khugepaged collapsing a huge page on a remote - * node. */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) @@ -184,10 +178,9 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 26 +#define __GFP_BITS_SHIFT 25 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 9e687ca9a307..15bf875d0e4a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -47,8 +47,7 @@ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 10eedbf14421..72339a646fb1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -919,8 +919,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, vma, + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0924a68cc36..77ae3239c3de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* * Before allocating the hugepage, release the mmap_sem read lock. @@ -1309,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm, VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | - __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cba2a64792e6..872caae544ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2584,8 +2584,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, - gfp_t flags) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA enum zone_stat_item local_stat = NUMA_LOCAL; @@ -2667,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, gfp_flags); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 35a02f8e5a4a..915869e00d86 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -655,7 +655,6 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, - { "__GFP_OTHER_NODE", "ON" }, }; static size_t max_gfp_len; From da0510c47519fe0999cffe316e1d370e29f952be Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 10 Jan 2017 16:57:45 -0800 Subject: [PATCH 55/86] lib/Kconfig.debug: fix frv build failure The build of frv allmodconfig was failing with the errors like: /tmp/cc0JSPc3.s: Assembler messages: /tmp/cc0JSPc3.s:1839: Error: symbol `.LSLT0' is already defined /tmp/cc0JSPc3.s:1842: Error: symbol `.LASLTP0' is already defined /tmp/cc0JSPc3.s:1969: Error: symbol `.LELTP0' is already defined /tmp/cc0JSPc3.s:1970: Error: symbol `.LELT0' is already defined Commit 866ced950bcd ("kbuild: Support split debug info v4") introduced splitting the debug info and keeping that in a separate file. Somehow, the frv-linux gcc did not like that and I am guessing that instead of splitting it started copying. The first report about this is at: https://lists.01.org/pipermail/kbuild-all/2015-July/010527.html. I will try and see if this can work with frv and if still fails I will open a bug report with gcc. But meanwhile this is the easiest option to solve build failure of frv. Fixes: 866ced950bcd ("kbuild: Support split debug info v4") Link: http://lkml.kernel.org/r/1482062348-5352-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Reported-by: Fengguang Wu Cc: Andi Kleen Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b06848a104e6..eb9e9a7870fa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -164,7 +164,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO + depends on DEBUG_INFO && !FRV help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO, From c626bc46edb0fec289adfc86b02e07d34127ef6c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 10 Jan 2017 16:57:48 -0800 Subject: [PATCH 56/86] ipc/sem.c: fix incorrect sem_lock pairing Based on the syzcaller test case from dvyukov: https://gist.githubusercontent.com/dvyukov/d0e5efefe4d7d6daed829f5c3ca26a40/raw/08d0a261fe3c987bed04fbf267e08ba04bd533ea/gistfile1.txt The slow (i.e.: failure to acquire) syscall exit from semtimedop() incorrectly assumed that the the same lock is acquired as it was at the initial syscall entry. This is wrong: - thread A: single semop semop(), sleeps - thread B: multi semop semop(), sleeps - thread A: woken up by signal/timeout With this sequence, the initial sem_lock() call locks the per-semaphore spinlock, and it is unlocked with sem_unlock(). The call at the syscall return locks the global spinlock. Because locknum is not updated, the following sem_unlock() call unlocks the per-semaphore spinlock, which is actually not locked. The fix is trivial: Use the return value from sem_lock. Fixes: 370b262c896e ("ipc/sem: avoid idr tree lookup for interrupted semop") Link: http://lkml.kernel.org/r/1482215645-22328-1-git-send-email-manfred@colorfullife.com Signed-off-by: Manfred Spraul Reported-by: Dmitry Vyukov Reported-by: Johanna Abrahamsson Tested-by: Johanna Abrahamsson Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index e08b94851922..3ec5742b5640 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1977,7 +1977,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } rcu_read_lock(); - sem_lock(sma, sops, nsops); + locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock_free; From 20f664aabeb88d582b623a625f83b0454fa34f07 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:57:51 -0800 Subject: [PATCH 57/86] mm: pmd dirty emulation in page fault handler Andreas reported [1] made a test in jemalloc hang in THP mode in arm64: http://lkml.kernel.org/r/mvmmvfy37g1.fsf@hawking.suse.de The problem is currently page fault handler doesn't supports dirty bit emulation of pmd for non-HW dirty-bit architecture so that application stucks until VM marked the pmd dirty. How the emulation work depends on the architecture. In case of arm64, when it set up pte firstly, it sets pte PTE_RDONLY to get a chance to mark the pte dirty via triggering page fault when store access happens. Once the page fault occurs, VM marks the pmd dirty and arch code for setting pmd will clear PTE_RDONLY for application to proceed. IOW, if VM doesn't mark the pmd dirty, application hangs forever by repeated fault(i.e., store op but the pmd is PTE_RDONLY). This patch enables pmd dirty-bit emulation for those architectures. [1] b8d3c4c3009d, mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called Fixes: b8d3c4c3009d ("mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called") Link: http://lkml.kernel.org/r/1482506098-6149-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Andreas Schwab Tested-by: Andreas Schwab Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Jason Evans Cc: Will Deacon Cc: Catalin Marinas Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 72339a646fb1..9a6bd6c8d55a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -883,15 +883,17 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); + if (write) + entry = pmd_mkdirty(entry); haddr = vmf->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, - vmf->flags & FAULT_FLAG_WRITE)) + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: From 2d39b3cd34e6d323720d4c61bd714f5ae202c022 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 10 Jan 2017 16:57:54 -0800 Subject: [PATCH 58/86] signal: protect SIGNAL_UNKILLABLE from unintentional clearing. Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we can now trace init processes. init is initially protected with SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but there are a number of paths during tracing where SIGNAL_UNKILLABLE can be implicitly cleared. This can result in init becoming stoppable/killable after tracing. For example, running: while true; do kill -STOP 1; done & strace -p 1 and then stopping strace and the kill loop will result in init being left in state TASK_STOPPED. Sending SIGCONT to init will resume it, but init will now respond to future SIGSTOP signals rather than ignoring them. Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED that we don't clear SIGNAL_UNKILLABLE. Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ kernel/signal.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..ad3ec9ec61f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,16 @@ struct signal_struct { #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { diff --git a/kernel/signal.c b/kernel/signal.c index ff046b73ff2d..3603d93a1968 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } From 9ebf73b275f06b114586af27cda3fd72e149d5ba Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 10 Jan 2017 16:57:57 -0800 Subject: [PATCH 59/86] mailmap: add codeaurora.org names for nameless email commits Some codeaurora.org emails have crept in but the names don't exist for them. Add the names for the emails so git can match everyone up. Link: http://lkml.kernel.org/r/20170104194611.25933-1-sboyd@codeaurora.org Signed-off-by: Stephen Boyd Cc: Sarangdhar Joshi Cc: Subash Abhinov Kasiviswanathan Cc: Subhash Jadavani Cc: Thomas Pedersen Cc: Andy Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.mailmap b/.mailmap index 02d261407683..67dc22ffc9a8 100644 --- a/.mailmap +++ b/.mailmap @@ -137,6 +137,7 @@ Ricardo Ribalda Delgado Rudolf Marek Rui Saraiva Sachin P Sant +Sarangdhar Joshi Sam Ravnborg Santosh Shilimkar Santosh Shilimkar @@ -150,10 +151,13 @@ Shuah Khan Simon Kelley Stéphane Witzmann Stephen Hemminger +Subash Abhinov Kasiviswanathan +Subhash Jadavani Sudeep Holla Sudeep KarkadaNagesha Sumit Semwal Tejun Heo Thomas Graf +Thomas Pedersen Tony Luck Tsuneo Yoshioka Uwe Kleine-König From f073bdc51771f5a5c7a8d1191bfc3ae371d44de7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 10 Jan 2017 16:58:00 -0800 Subject: [PATCH 60/86] mm: don't dereference struct page fields of invalid pages The VM_BUG_ON() check in move_freepages() checks whether the node id of a page matches the node id of its zone. However, it does this before having checked whether the struct page pointer refers to a valid struct page to begin with. This is guaranteed in most cases, but may not be the case if CONFIG_HOLES_IN_ZONE=y. So reorder the VM_BUG_ON() with the pfn_valid_within() check. Link: http://lkml.kernel.org/r/1481706707-6211-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Cc: Catalin Marinas Cc: Hanjun Guo Cc: Yisheng Xie Cc: Robert Richter Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 872caae544ef..74afdb4177cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:58:04 -0800 Subject: [PATCH 61/86] mm, memcg: fix the active list aging for lowmem requests when memcg is enabled Nils Holland and Klaus Ethgen have reported unexpected OOM killer invocations with 32b kernel starting with 4.8 kernels kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 kworker/u4:5 cpuset=/ mems_allowed=0 CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 [...] Mem-Info: active_anon:58685 inactive_anon:90 isolated_anon:0 active_file:274324 inactive_file:281962 isolated_file:0 unevictable:0 dirty:649 writeback:0 unstable:0 slab_reclaimable:40662 slab_unreclaimable:17754 mapped:7382 shmem:202 pagetables:351 bounce:0 free:206736 free_pcp:332 free_cma:0 Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 813 3474 3474 Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB lowmem_reserve[]: 0 0 21292 21292 HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB the oom killer is clearly pre-mature because there there is still a lot of page cache in the zone Normal which should satisfy this lowmem request. Further debugging has shown that the reclaim cannot make any forward progress because the page cache is hidden in the active list which doesn't get rotated because inactive_list_is_low is not memcg aware. The code simply subtracts per-zone highmem counters from the respective memcg's lru sizes which doesn't make any sense. We can simply end up always seeing the resulting active and inactive counts 0 and return false. This issue is not limited to 32b kernels but in practice the effect on systems without CONFIG_HIGHMEM would be much harder to notice because we do not invoke the OOM killer for allocations requests targeting < ZONE_NORMAL. Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node and subtract per-memcg highmem counts when memcg is enabled. Introduce helper lruvec_zone_lru_size which redirects to either zone counters or mem_cgroup_get_zone_lru_size when appropriate. We are losing empty LRU but non-zero lru size detection introduced by ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because of the inherent zone vs. node discrepancy. Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Nils Holland Tested-by: Nils Holland Reported-by: Klaus Ethgen Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 18 ++++++++---------- mm/vmscan.c | 27 +++++++++++++++++---------- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 61d20c17f3b7..254698856b8f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_node { struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + int zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -441,9 +441,23 @@ static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_node *mz; + unsigned long nr_pages = 0; + int zid; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_size[lru]; + for (zid = 0; zid < MAX_NR_ZONES; zid++) + nr_pages += mz->lru_zone_size[zid][lru]; + return nr_pages; +} + +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + struct mem_cgroup_per_node *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->lru_zone_size[zone_idx][lru]; } void mem_cgroup_handle_over_high(void); @@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { return 0; } +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + return 0; +} static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 71613e8a720f..41d376e7116d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4048897e7b01..a63a8f832664 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1001,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,27 +1009,25 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; - bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; - empty = list_empty(lruvec->lists + lru); + lru_size = &mz->lru_zone_size[zid][lru]; if (nr_pages < 0) *lru_size += nr_pages; size = *lru_size; - if (WARN_ONCE(size < 0 || empty != !size, - "%s(%p, %d, %d): lru_size %ld but %sempty\n", - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6aa5b01d3e75..532a2a750952 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); +} + /* * Add a shrinker callback to be called from the vm. */ @@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); +#endif } -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); -#endif } /* @@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!managed_zone(zone)) continue; - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); inactive -= min(inactive, inactive_zone); active -= min(active, active_zone); From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:06 -0800 Subject: [PATCH 62/86] mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free Patch series "Page fragment updates", v4. This patch series takes care of a few cleanups for the page fragments API. First we do some renames so that things are much more consistent. First we move the page_frag_ portion of the name to the front of the functions names. Secondly we split out the cache specific functions from the other page fragment functions by adding the word "cache" to the name. Finally I added a bit of documentation that will hopefully help to explain some of this. I plan to revisit this later as we get things more ironed out in the near future with the changes planned for the DMA setup to support eXpress Data Path. This patch (of 3): This patch renames the page frag functions to be more consistent with other APIs. Specifically we place the name page_frag first in the name and then have either an alloc or free call name that we append as the suffix. This makes it a bit clearer in terms of naming. In addition we drop the leading double underscores since we are technically no longer a backing interface and instead the front end that is called from the networking APIs. Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +++--- include/linux/skbuff.h | 2 +- mm/page_alloc.c | 10 +++++----- net/core/skbuff.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7806a8f80abc..ed77a86fbbb0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,9 +501,9 @@ extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; extern void __page_frag_drain(struct page *page, unsigned int order, unsigned int count); -extern void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask); -extern void __free_page_frag(void *addr); +extern void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b53c0cfd417e..a410715bbef8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, static inline void skb_free_frag(void *addr) { - __free_page_frag(addr); + page_frag_free(addr); } void *napi_alloc_frag(unsigned int fragsz); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74afdb4177cb..097893ffe194 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3931,8 +3931,8 @@ void __page_frag_drain(struct page *page, unsigned int order, } EXPORT_SYMBOL(__page_frag_drain); -void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -3983,19 +3983,19 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(__alloc_page_frag); +EXPORT_SYMBOL(page_frag_alloc); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ -void __free_page_frag(void *addr) +void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) __free_pages_ok(page, compound_order(page)); } -EXPORT_SYMBOL(__free_page_frag); +EXPORT_SYMBOL(page_frag_free); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..734c71468b01 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; From 2976db8018532b624c4123ae662fbc0814877abf Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:09 -0800 Subject: [PATCH 63/86] mm: rename __page_frag functions to __page_frag_cache, drop order from drain This patch does two things. First it goes through and renames the __page_frag prefixed functions to __page_frag_cache so that we can be clear that we are draining or refilling the cache, not the frags themselves. Second we drop the order parameter from __page_frag_cache_drain since we don't actually need to pass it since all fragments are either order 0 or must be a compound page. Link: http://lkml.kernel.org/r/20170104023954.13451.5678.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- include/linux/gfp.h | 3 +-- mm/page_alloc.c | 13 +++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a761001308dc..1515abaa5ac9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3962,8 +3962,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(buffer_info->page, 0, - buffer_info->pagecnt_bias); + __page_frag_cache_drain(buffer_info->page, + buffer_info->pagecnt_bias); buffer_info->page = NULL; } @@ -6991,7 +6991,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(page, 0, rx_buffer->pagecnt_bias); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of rx_buffer */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed77a86fbbb0..0fe0b6295ab5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -499,8 +499,7 @@ extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; -extern void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count); +extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 097893ffe194..d604d2596b7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3896,8 +3896,8 @@ EXPORT_SYMBOL(free_pages); * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ -static struct page *__page_frag_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) { struct page *page = NULL; gfp_t gfp = gfp_mask; @@ -3917,19 +3917,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } -void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count) +void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) { + unsigned int order = compound_order(page); + if (order == 0) free_hot_cold_page(page, false); else __free_pages_ok(page, order); } } -EXPORT_SYMBOL(__page_frag_drain); +EXPORT_SYMBOL(__page_frag_cache_drain); void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) @@ -3940,7 +3941,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, if (unlikely(!nc->va)) { refill: - page = __page_frag_refill(nc, gfp_mask); + page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; From 4d09d0f45dd5d78b3a301c238272211d1ea7d9e6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:12 -0800 Subject: [PATCH 64/86] mm: add documentation for page fragment APIs This is a first pass at trying to add documentation for the page_frag APIs. They may still change over time but for now I thought I would try to get these documented so that as more network drivers and stack calls make use of them we have one central spot to document how they are meant to be used. Link: http://lkml.kernel.org/r/20170104024157.13451.6758.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_frags | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/vm/page_frags diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags new file mode 100644 index 000000000000..a6714565dbf9 --- /dev/null +++ b/Documentation/vm/page_frags @@ -0,0 +1,42 @@ +Page fragments +-------------- + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:15 -0800 Subject: [PATCH 65/86] mm: support anonymous stable page During developemnt for zram-swap asynchronous writeback, I found strange corruption of compressed page, resulting in: Modules linked in: zram(E) CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff88007620b840 task.stack: ffff880078090000 RIP: set_freeobj.part.43+0x1c/0x1f RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 Call Trace: obj_malloc+0x22b/0x260 zs_malloc+0x1e4/0x580 zram_bvec_rw+0x4cd/0x830 [zram] page_requests_rw+0x9c/0x130 [zram] zram_thread+0xe6/0x173 [zram] kthread+0xca/0xe0 ret_from_fork+0x25/0x30 With investigation, it reveals currently stable page doesn't support anonymous page. IOW, reuse_swap_page can reuse the page without waiting writeback completion so it can overwrite page zram is compressing. Unfortunately, zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true unless stable page supports. So, If the data is changed under us, zram can make buffer overrun because second compression size could be bigger than one we got in previous trial and blindly, copy bigger size object to smaller buffer which is buffer overrun. The overrun breaks zsmalloc free object chaining so system goes crash like above. I think below is same problem. https://bugzilla.suse.com/show_bug.cgi?id=997574 Unfortunately, reuse_swap_page should be atomic so that we cannot wait on writeback in there so the approach in this patch is simply return false if we found it needs stable page. Although it increases memory footprint temporarily, it happens rarely and it should be reclaimed easily althoug it happened. Also, It would be better than waiting of IO completion, which is critial path for application latency. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Cc: Sergey Senozhatsky Cc: Darrick J. Wong Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 09f4be179ff3..7f47b7098b1b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -150,8 +150,9 @@ enum { SWP_FILE = (1 << 7), /* set after swap_activate success */ SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ /* add others here before... */ - SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index 1c6e0321205d..4761701d1721 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -2448,6 +2462,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; From e7ccfc4ccb703e0f033bd4617580039898e912dd Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:18 -0800 Subject: [PATCH 66/86] zram: revalidate disk under init_lock Commit b4c5c60920e3 ("zram: avoid lockdep splat by revalidate_disk") moved revalidate_disk call out of init_lock to avoid lockdep false-positive splat. However, commit 08eee69fcf6b ("zram: remove init_lock in zram_make_request") removed init_lock in IO path so there is no worry about lockdep splat. So, let's restore it. This patch is needed to set BDI_CAP_STABLE_WRITES atomically in next patch. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15f58ab44d0b..195376b4472b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1095,14 +1095,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ revalidate_disk(zram->disk); + up_write(&zram->init_lock); return len; From b09ab054b69b07077bd3292f67e777861ac796e5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:21 -0800 Subject: [PATCH 67/86] zram: support BDI_CAP_STABLE_WRITES zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true without stable page support. So, If the data is changed under us, zram can make buffer overrun so that zsmalloc free object chain is broken so system goes crash like below https://bugzilla.suse.com/show_bug.cgi?id=997574 This patch adds BDI_CAP_STABLE_WRITES to zram for declaring "I am block device needing *stable write*". Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 195376b4472b..e5ab7d9e8c45 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,14 @@ static inline bool is_partial_io(struct bio_vec *bvec) return bvec->bv_len != PAGE_SIZE; } +static void zram_revalidate_disk(struct zram *zram) +{ + revalidate_disk(zram->disk); + /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ + zram->disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; +} + /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1095,7 +1104,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); up_write(&zram->init_lock); return len; @@ -1143,7 +1152,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); bdput(bdev); mutex_lock(&bdev->bd_mutex); From c4e490cf148e85ead0d1b1c2caaba833f1d5b29f Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Tue, 10 Jan 2017 16:58:24 -0800 Subject: [PATCH 68/86] mm/slab.c: fix SLAB freelist randomization duplicate entries This patch fixes a bug in the freelist randomization code. When a high random number is used, the freelist will contain duplicate entries. It will result in different allocations sharing the same chunk. It will result in odd behaviours and crashes. It should be uncommon but it depends on the machines. We saw it happening more often on some machines (every few hours of running tests). Fixes: c7ce4f60ac19 ("mm: SLAB freelist randomization") Link: http://lkml.kernel.org/r/20170103181908.143178-1-thgarnie@google.com Signed-off-by: John Sperbeck Signed-off-by: Thomas Garnier Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 29bc6c0dedd0..4f2ec6bb46eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2457,7 +2457,6 @@ union freelist_init_state { unsigned int pos; unsigned int *list; unsigned int count; - unsigned int rand; }; struct rnd_state rnd_state; }; @@ -2483,8 +2482,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, } else { state->list = cachep->random_seq; state->count = count; - state->pos = 0; - state->rand = rand; + state->pos = rand % count; ret = true; } return ret; @@ -2493,7 +2491,9 @@ static bool freelist_state_initialize(union freelist_init_state *state, /* Get the next entry on the list and randomize it using a random shift */ static freelist_idx_t next_random_slot(union freelist_init_state *state) { - return (state->list[state->pos++] + state->rand) % state->count; + if (state->pos >= state->count) + state->pos = 0; + return state->list[state->pos++]; } /* Swap two freelist entries */ From e5bbc8a6c992901058bc09e2ce01d16c111ff047 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 10 Jan 2017 16:58:27 -0800 Subject: [PATCH 69/86] mm/hugetlb.c: fix reservation race when freeing surplus pages return_unused_surplus_pages() decrements the global reservation count, and frees any unused surplus pages that were backing the reservation. Commit 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") added a call to cond_resched_lock in the loop freeing the pages. As a result, the hugetlb_lock could be dropped, and someone else could use the pages that will be freed in subsequent iterations of the loop. This could result in inconsistent global hugetlb page state, application api failures (such as mmap) failures or application crashes. When dropping the lock in return_unused_surplus_pages, make sure that the global reservation count (resv_huge_pages) remains sufficiently large to prevent someone else from claiming pages about to be freed. Analyzed by Paul Cassella. Fixes: 7848a4bf51b3 ("mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages()") Link: http://lkml.kernel.org/r/1483991767-6879-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Paul Cassella Suggested-by: Michal Hocko Cc: Masayoshi Mizuma Cc: Naoya Horiguchi Cc: Aneesh Kumar Cc: Hillf Danton Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3edb759c5c7d..c7025c132670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1773,23 +1773,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } From 575b1967e10a1f3038266244d2c7a3ca6b99fed8 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 10 Jan 2017 16:58:30 -0800 Subject: [PATCH 70/86] timerfd: export defines to userspace Since userspace is expected to call timerfd syscalls directly with these flags/ioctls, make sure we export them so they don't have to duplicate the values themselves. Link: http://lkml.kernel.org/r/20161219064052.7196-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timerfd.h | 20 +------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/timerfd.h | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/timerfd.h diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index bd36ce431e32..bab0b1ad0613 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,23 +8,7 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H -/* For O_CLOEXEC and O_NONBLOCK */ -#include - -/* For _IO helpers */ -#include - -/* - * CAREFUL: Check include/asm-generic/fcntl.h when defining - * new flags, since they might collide with O_* ones. We want - * to re-use O_* flags that couldn't possibly have a meaning - * from eventfd, in order to leave a free define-space for - * shared O_* flags. - */ -#define TFD_TIMER_ABSTIME (1 << 0) -#define TFD_TIMER_CANCEL_ON_SET (1 << 1) -#define TFD_CLOEXEC O_CLOEXEC -#define TFD_NONBLOCK O_NONBLOCK +#include #define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) /* Flags for timerfd_create. */ @@ -32,6 +16,4 @@ /* Flags for timerfd_settime. */ #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) -#define TFD_IOC_SET_TICKS _IOW('T', 0, u64) - #endif /* _LINUX_TIMERFD_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index a8b93e685239..f330ba4547cf 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -414,6 +414,7 @@ header-y += telephony.h header-y += termios.h header-y += thermal.h header-y += time.h +header-y += timerfd.h header-y += times.h header-y += timex.h header-y += tiocl.h diff --git a/include/uapi/linux/timerfd.h b/include/uapi/linux/timerfd.h new file mode 100644 index 000000000000..6fcfaa8da173 --- /dev/null +++ b/include/uapi/linux/timerfd.h @@ -0,0 +1,36 @@ +/* + * include/linux/timerfd.h + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#ifndef _UAPI_LINUX_TIMERFD_H +#define _UAPI_LINUX_TIMERFD_H + +#include + +/* For O_CLOEXEC and O_NONBLOCK */ +#include + +/* For _IO helpers */ +#include + +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + * + * Also make sure to update the masks in include/linux/timerfd.h + * when adding new flags. + */ +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_IOC_SET_TICKS _IOW('T', 0, __u64) + +#endif /* _UAPI_LINUX_TIMERFD_H */ From cd3776638003b3362d9d7d1f27bcb80c276e2c28 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:30 +0200 Subject: [PATCH 71/86] net/mlx5e: Properly handle offloading of source udp port for IP tunnels We can offload the matching on source udp port of ip tunnels for decapsulation. We can not offload setting source udp port for tunnels as part of encapsulation. Fix both the code that deals with matching offload (decap) and the code that deal with encap offload to align with that. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index f8829b517156..b60feceab63b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -239,10 +239,6 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) return -EOPNOTSUPP; - /* udp src port isn't supported */ - if (memchr_inv(&mask->src, 0, sizeof(mask->src))) - return -EOPNOTSUPP; - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); @@ -254,6 +250,10 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, ntohs(key->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(key->src)); } else { /* udp dst port must be given */ return -EOPNOTSUPP; } @@ -796,6 +796,10 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) return -EOPNOTSUPP; + /* setting udp src port isn't supported */ + if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) + return -EOPNOTSUPP; + if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { info.tp_dst = key->tp_dst; From 2fcd82e9be133e4ec777f66fd67a8fb8e7748b1b Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:31 +0200 Subject: [PATCH 72/86] net/mlx5e: Warn when rejecting offload attempts of IP tunnels We silently reject offloading of IPv6 tunnels, non vxlan tunnels, vxlan tunnels where the dst port to match is not provided, etc. Be a bit more verbose and print a warning so the user better realizes what went wrong here and can fix it. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index b60feceab63b..d2fc055f054a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -237,13 +237,16 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, /* Full udp dst port must be given */ if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) - return -EOPNOTSUPP; + goto vxlan_match_offload_err; if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); - else + else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->dst)); return -EOPNOTSUPP; + } MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, ntohs(mask->dst)); @@ -255,7 +258,10 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, ntohs(key->src)); } else { /* udp dst port must be given */ - return -EOPNOTSUPP; +vxlan_match_offload_err: + netdev_warn(priv->netdev, + "IP tunnel decap offload supported only for vxlan, must set UDP dport\n"); + return -EOPNOTSUPP; } if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { @@ -346,6 +352,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, if (parse_tunnel_attr(priv, spec, f)) return -EOPNOTSUPP; break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + netdev_warn(priv->netdev, + "IPv6 tunnel decap offload isn't supported\n"); default: return -EOPNOTSUPP; } @@ -792,13 +801,17 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, int tunnel_type; int err; - /* udp dst port must be given */ + /* udp dst port must be set */ if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) - return -EOPNOTSUPP; + goto vxlan_encap_offload_err; /* setting udp src port isn't supported */ - if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) + if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) { +vxlan_encap_offload_err: + netdev_warn(priv->netdev, + "must set udp dst port and not set udp src port\n"); return -EOPNOTSUPP; + } if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { @@ -806,6 +819,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, info.tun_id = tunnel_id_to_key32(key->tun_id); tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->tp_dst)); return -EOPNOTSUPP; } @@ -813,6 +828,9 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, case AF_INET: info.daddr = key->u.ipv4.dst; break; + case AF_INET6: + netdev_warn(priv->netdev, + "IPv6 tunnel encap offload isn't supported\n"); default: return -EOPNOTSUPP; } From a42485eb0ee458da3a0df82b0e42ab58ce76be05 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:32 +0200 Subject: [PATCH 73/86] net/mlx5e: TC ipv4 tunnel encap offload error flow fixes When the route lookup fails we should return the actual error. When the neigh isn't valid, we should return -EOPNOTSUPP as done in similar cases along the code. When the offload can't take place as of invalid neigh etc, we must release the neigh. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d2fc055f054a..b62f06f3f7e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -656,17 +656,14 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, #if IS_ENABLED(CONFIG_INET) rt = ip_route_output_key(dev_net(mirred_dev), fl4); - if (IS_ERR(rt)) { - pr_warn("%s: no route to %pI4\n", __func__, &fl4->daddr); - return -EOPNOTSUPP; - } + if (IS_ERR(rt)) + return PTR_ERR(rt); #else return -EOPNOTSUPP; #endif if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev)) { - pr_warn("%s: Can't offload the flow, netdevices aren't on the same HW e-switch\n", - __func__); + pr_warn("%s: can't offload, devices not on same HW e-switch\n", __func__); ip_rt_put(rt); return -EOPNOTSUPP; } @@ -727,8 +724,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct net_device **out_dev) { int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + struct neighbour *n = NULL; struct flowi4 fl4 = {}; - struct neighbour *n; char *encap_header; int encap_size; __be32 saddr; @@ -759,7 +756,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, e->out_dev = *out_dev; if (!(n->nud_state & NUD_VALID)) { - err = -ENOTSUPP; + pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", __func__, &fl4.daddr); + err = -EOPNOTSUPP; goto out; } @@ -781,6 +779,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, encap_size, encap_header, &e->encap_id); out: + if (err && n) + neigh_release(n); kfree(encap_header); return err; } From 2e72eb438ce5ea9fa118edfd9a5f628c2a69111a Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:33 +0200 Subject: [PATCH 74/86] net/mlx5e: Properly get address type of encapsulation IP headers As done elsewhere in our TC/flower offload code, the address type of the encapsulation IP headers should be realized accroding to the addr_type field of the encapsulation control dissector key, do that. Fixes: bbd00f7e2349 ('net/mlx5e: Add TC tunnel release action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index b62f06f3f7e0..9cfddd9fc097 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -225,6 +225,11 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + struct flow_dissector_key_control *enc_control = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + f->key); + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *key = skb_flow_dissector_target(f->dissector, @@ -264,7 +269,7 @@ vxlan_match_offload_err: return -EOPNOTSUPP; } - if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { struct flow_dissector_key_ipv4_addrs *key = skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -286,10 +291,10 @@ vxlan_match_offload_err: MLX5_SET(fte_match_set_lyr_2_4, headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4, ntohl(key->dst)); - } - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + } /* Enforce DMAC when offloading incoming tunneled flows. * Flow counters require a match on the DMAC. From 0827444d052ba5347900376dbdbf5d9065d091d4 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:34 +0200 Subject: [PATCH 75/86] net/mlx5e: Set inline mode requirements for matching on IP fragments For e-switch level matching on packets being an IP fragment, we need to make sure the source vport inline mode is L3, fix that. Fixes: 3f7d0eb42d59 ('net/mlx5e: Offload TC matching on packets being IP fragments') Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9cfddd9fc097..a35fa1eb0694 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -389,6 +389,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, key->flags & FLOW_DIS_IS_FRAGMENT); + + /* the HW doesn't need L3 inline to match on frag=no */ + if (key->flags & FLOW_DIS_IS_FRAGMENT) + *min_inline = MLX5_INLINE_MODE_IP; } } From a757d108dc1a053722215ee89116f8af9bba1525 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Tue, 10 Jan 2017 22:33:35 +0200 Subject: [PATCH 76/86] net/mlx5e: Fix kbuild warnings for uninitialized parameters kbuild warn about parameters that may be used uninitialized, fix it. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a35fa1eb0694..5dbc81de34ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -737,8 +737,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct flowi4 fl4 = {}; char *encap_header; int encap_size; - __be32 saddr; - int ttl; + __be32 saddr = 0; + int ttl = 0; int err; encap_header = kzalloc(max_encap_size, GFP_KERNEL); From 5e86397abe10aa4c884478a45e9a35b6a37d8d5d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 10 Jan 2017 22:33:36 +0200 Subject: [PATCH 77/86] net/mlx5e: Properly handle FW errors while adding TC rules When the firmware returns an error (common example is an attempt to add twice the same rule which is refused by the some FWs), we are not properly derefing/cleaning few resources allocated on the way. Examples are vport vlan deref under eswitch vlan offloads, and encap entry/neighbour deref under eswitch encapsulation offloads, fix that. Fixes: a54e20b4fcae ('net/mlx5e: Add basic TC tunnel set action for SRIOV offloads') Fixes: 8b32580df1cb ('net/mlx5e: Add TC vlan action for SRIOV offloads') Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 5dbc81de34ee..118cea5e5489 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -161,15 +161,21 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, } } +/* we get here also when setting rule to the FW failed, etc. It means that the + * flow rule itself might not exist, but some offloading related to the actions + * should be cleaned. + */ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_fc *counter = NULL; - counter = mlx5_flow_rule_counter(flow->rule); - - mlx5_del_flow_rules(flow->rule); + if (!IS_ERR(flow->rule)) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_del_flow_rules(flow->rule); + mlx5_fc_destroy(priv->mdev, counter); + } if (esw && esw->mode == SRIOV_OFFLOADS) { mlx5_eswitch_del_vlan_action(esw, flow->attr); @@ -177,8 +183,6 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, mlx5e_detach_encap(priv, flow); } - mlx5_fc_destroy(priv->mdev, counter); - if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { mlx5_destroy_flow_table(priv->fs.tc.t); priv->fs.tc.t = NULL; @@ -1017,7 +1021,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, if (IS_ERR(flow->rule)) { err = PTR_ERR(flow->rule); - goto err_free; + goto err_del_rule; } err = rhashtable_insert_fast(&tc->ht, &flow->node, @@ -1028,7 +1032,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, goto out; err_del_rule: - mlx5_del_flow_rules(flow->rule); + mlx5e_tc_del_flow(priv, flow); err_free: kfree(flow); From 3deef8cea3efcaeeae240bb00541de66abb9bfa0 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 10 Jan 2017 22:33:37 +0200 Subject: [PATCH 78/86] net/mlx5e: Un-register uplink representor on nic_disable The code before this patch registered uplink e-Switch representor on nic_enable and unregistered on nic_cleanup, the right place for this unregister is in nic_disable. Fixes: 127ea380acc9 ("net/mlx5: Add Representors registration API") Signed-off-by: Saeed Mahameed Reviewed-by: Mohamad Haj Yahia Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1236b27b1493..2b7dd315020c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3675,14 +3675,8 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev, static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { - struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5_eswitch *esw = mdev->priv.eswitch; - mlx5e_vxlan_cleanup(priv); - if (MLX5_CAP_GEN(mdev, vport_group_manager)) - mlx5_eswitch_unregister_vport_rep(esw, 0); - if (priv->xdp_prog) bpf_prog_put(priv->xdp_prog); } @@ -3807,9 +3801,14 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) static void mlx5e_nic_disable(struct mlx5e_priv *priv) { + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + queue_work(priv->wq, &priv->set_rx_mode_work); + if (MLX5_CAP_GEN(mdev, vport_group_manager)) + mlx5_eswitch_unregister_vport_rep(esw, 0); mlx5e_disable_async_events(priv); - mlx5_lag_remove(priv->mdev); + mlx5_lag_remove(mdev); } static const struct mlx5e_profile mlx5e_nic_profile = { From 0bbcc0a8fc394d01988fe0263ccf7fddb77a12c3 Mon Sep 17 00:00:00 2001 From: Gil Rockah Date: Tue, 10 Jan 2017 22:33:38 +0200 Subject: [PATCH 79/86] net/mlx5e: Remove WARN_ONCE from adaptive moderation code When trying to do interface down or changing interface configuration under heavy traffic, some of the adaptive moderation corner cases can occur and leave a WARN_ONCE call trace in the kernel log. Those WARN_ONCE are meant for debug only, and should have been inserted only under debug. We avoid such call traces by removing those WARN_ONCE. Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Gil Rockah Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c index 1fffe48a93cc..cbfac06b7ffd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c @@ -109,7 +109,6 @@ static bool mlx5e_am_on_top(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_on_top: PARKING\n"); return true; case MLX5E_AM_GOING_RIGHT: return (am->steps_left > 1) && (am->steps_right == 1); @@ -123,7 +122,6 @@ static void mlx5e_am_turn(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_turn: PARKING\n"); break; case MLX5E_AM_GOING_RIGHT: am->tune_state = MLX5E_AM_GOING_LEFT; @@ -144,7 +142,6 @@ static int mlx5e_am_step(struct mlx5e_rx_am *am) switch (am->tune_state) { case MLX5E_AM_PARKING_ON_TOP: case MLX5E_AM_PARKING_TIRED: - WARN_ONCE(true, "mlx5e_am_step: PARKING\n"); break; case MLX5E_AM_GOING_RIGHT: if (am->profile_ix == (MLX5E_PARAMS_AM_NUM_PROFILES - 1)) @@ -282,10 +279,8 @@ static void mlx5e_am_calc_stats(struct mlx5e_rx_am_sample *start, u32 delta_us = ktime_us_delta(end->time, start->time); unsigned int npkts = end->pkt_ctr - start->pkt_ctr; - if (!delta_us) { - WARN_ONCE(true, "mlx5e_am_calc_stats: delta_us=0\n"); + if (!delta_us) return; - } curr_stats->ppms = (npkts * USEC_PER_MSEC) / delta_us; curr_stats->epms = (MLX5E_AM_NEVENTS * USEC_PER_MSEC) / delta_us; From 5e44fca5047054f1762813751626b5245e0da022 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 10 Jan 2017 22:33:39 +0200 Subject: [PATCH 80/86] net/mlx5: Only cancel recovery work when cleaning up device Do not attempt to drain the health workqueue when unloading the device in the recovery flow, this can cause a deadlock when the recovery work tries to cancel itself with sync. Because the work is no longer unconditionally canceled when unloading, it must be explicitly canceled in the AER flow. fixes: 689a248df83b ("net/mlx5: Cancel recovery work in remove flow") Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6547f22e6b9b..d01e9f21d469 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1195,7 +1195,8 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, { int err = 0; - mlx5_drain_health_wq(dev); + if (cleanup) + mlx5_drain_health_wq(dev); mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { @@ -1359,9 +1360,10 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - /* In case of kernel call save the pci state */ + /* In case of kernel call save the pci state and drain the health wq */ if (state) { pci_save_state(pdev); + mlx5_drain_health_wq(dev); mlx5_pci_disable_device(dev); } From 7cfd5fd5a9813f1430290d20c0fead9b4582a307 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Jan 2017 19:52:43 -0800 Subject: [PATCH 81/86] gro: use min_t() in skb_gro_reset_offset() On 32bit arches, (skb->end - skb->data) is not 'unsigned int', so we shall use min_t() instead of min() to avoid a compiler error. Fixes: 1272ce87fa01 ("gro: Enter slow-path if there is no tailroom") Reported-by: kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 88d2907ca2cd..07b307b0b414 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,8 +4441,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), - skb->end - skb->tail); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); } } From 73b351473547e543e9c8166dd67fd99c64c15b0b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jan 2017 13:08:06 +0100 Subject: [PATCH 82/86] cgroup: move CONFIG_SOCK_CGROUP_DATA to init/Kconfig We now 'select SOCK_CGROUP_DATA' but Kconfig complains that this is not right when CONFIG_NET is disabled and there is no socket interface: warning: (CGROUP_BPF) selects SOCK_CGROUP_DATA which has unmet direct dependencies (NET) I don't know what the correct solution for this is, but simply removing the dependency on NET from SOCK_CGROUP_DATA by moving it out of the 'if NET' section avoids the warning and does not produce other build errors. Fixes: 483c4933ea09 ("cgroup: Fix CGROUP_BPF config") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- init/Kconfig | 4 ++++ net/Kconfig | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..e1a937348a3e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1176,6 +1176,10 @@ config CGROUP_DEBUG Say N. +config SOCK_CGROUP_DATA + bool + default n + endif # CGROUPS config CHECKPOINT_RESTORE diff --git a/net/Kconfig b/net/Kconfig index a1005007224c..a29bb4b41c50 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -258,10 +258,6 @@ config XPS config HWBM bool -config SOCK_CGROUP_DATA - bool - default n - config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS From 7a18c5b9fb31a999afc62b0e60978aa896fc89e9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 10 Jan 2017 14:37:35 -0800 Subject: [PATCH 83/86] net: ipv4: Fix multipath selection with vrf fib_select_path does not call fib_select_multipath if oif is set in the flow struct. For VRF use cases oif is always set, so multipath route selection is bypassed. Use the FLOWI_FLAG_SKIP_NH_OIF to skip the oif check similar to what is done in fib_table_lookup. Add saddr and proto to the flow struct for the fib lookup done by the VRF driver to better match hash computation for a flow. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ net/ipv4/fib_semantics.c | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 23dfb0eac098..0a067708aa39 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -263,7 +263,9 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, + .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, + .saddr = ip4h->saddr, }; struct net *net = dev_net(vrf_dev); struct rtable *rt; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7a5b4c7d9a87..eba1546b5031 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1618,8 +1618,13 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, int mp_hash) { + bool oif_check; + + oif_check = (fl4->flowi4_oif == 0 || + fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (res->fi->fib_nhs > 1 && oif_check) { if (mp_hash < 0) mp_hash = get_hash_from_flowi4(fl4) >> 1; @@ -1629,7 +1634,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && !fl4->flowi4_oif) + res->type == RTN_UNICAST && oif_check) fib_select_default(fl4, res); if (!fl4->saddr) From eb004603c857f3e3bfcda437b6c68fd258c54960 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Jan 2017 22:53:06 +0000 Subject: [PATCH 84/86] sctp: Fix spelling mistake: "Atempt" -> "Attempt" Trivial fix to spelling mistake in WARN_ONCE message Signed-off-by: Colin Ian King Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e54082699520..34efaa4ef2f6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) (new_transport->state == SCTP_PF))) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { - WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); + WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; From a13c06525ab9ff442924e67df9393a5efa914c56 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 10 Jan 2017 23:13:45 +0000 Subject: [PATCH 85/86] net: phy: marvell: fix Marvell 88E1512 used in SGMII mode When an Marvell 88E1512 PHY is connected to a nic in SGMII mode, the fiber page is used for the SGMII host-side connection. The PHY driver notices that SUPPORTED_FIBRE is set, so it tries reading the fiber page for the link status, and ends up reading the MAC-side status instead of the outgoing (copper) link. This leads to incorrect results reported via ethtool. If the PHY is connected via SGMII to the host, ignore the fiber page. However, continue to allow the existing power management code to suspend and resume the fiber page. Fixes: 6cfb3bcc0641 ("Marvell phy: check link status in case of fiber link.") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e269262471a4..0b78210c0fa7 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1192,7 +1192,8 @@ static int marvell_read_status(struct phy_device *phydev) int err; /* Check the fiber mode first */ - if (phydev->supported & SUPPORTED_FIBRE) { + if (phydev->supported & SUPPORTED_FIBRE && + phydev->interface != PHY_INTERFACE_MODE_SGMII) { err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M1111_FIBER); if (err < 0) goto error; From 24c63bbc18e25d5d8439422aa5fd2d66390b88eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 10 Jan 2017 15:22:25 -0800 Subject: [PATCH 86/86] net: vrf: do not allow table id 0 Frank reported that vrf devices can be created with a table id of 0. This breaks many of the run time table id checks and should not be allowed. Detect this condition at create time and fail with EINVAL. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Reported-by: Frank Kellermann Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 0a067708aa39..454f907d419a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1252,6 +1252,8 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); + if (vrf->tb_id == RT_TABLE_UNSPEC) + return -EINVAL; dev->priv_flags |= IFF_L3MDEV_MASTER;