From 4c7b65fc3541567d26e62699756e5913e2d2f8b1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Nov 2018 12:06:08 +0100 Subject: [PATCH] 4.9-stable patches added patches: clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch libceph-bump-ceph_msg_max_data_len.patch mach64-fix-display-corruption-on-big-endian-machines.patch mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch of-numa-validate-some-distance-map-rules.patch reset-hisilicon-fix-potential-null-pointer-dereference.patch revert-ceph-fix-dentry-leak-in-splice_dentry.patch vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch xtensa-add-notes-section-to-the-linker-script.patch xtensa-fix-boot-parameters-address-translation.patch xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch --- ...-division-by-zero-in-pll-recalc_rate.patch | 37 +++ ...g-in-rockchip_ddrclk_get_parent-call.patch | 45 ++++ ...dule-and-dt-node-contains-compatible.patch | 75 ++++++ .../libceph-bump-ceph_msg_max_data_len.patch | 42 ++++ ...ay-corruption-on-big-endian-machines.patch | 59 +++++ ...due-to-reading-accelerator-registers.patch | 114 +++++++++ ..._thisnode-for-madv_hugepage-mappings.patch | 228 ++++++++++++++++++ ...-conflicting-bch_const_params-option.patch | 51 ++++ ...-of-next-bucket-number-in-early_drop.patch | 59 +++++ ...-after-failing-ocfs2_check_dir_entry.patch | 54 +++++ ...uma-validate-some-distance-map-rules.patch | 80 ++++++ ...x-potential-null-pointer-dereference.patch | 42 ++++ ...eph-fix-dentry-leak-in-splice_dentry.patch | 41 ++++ queue-4.9/series | 
17 ++ ...uncate-t10-pi-iov_iter-to-prot_bytes.patch | 47 ++++ ...d-notes-section-to-the-linker-script.patch | 44 ++++ ...-boot-parameters-address-translation.patch | 43 ++++ ...e-sure-bflt-stack-is-16-byte-aligned.patch | 45 ++++ 18 files changed, 1123 insertions(+) create mode 100644 queue-4.9/clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch create mode 100644 queue-4.9/clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch create mode 100644 queue-4.9/clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch create mode 100644 queue-4.9/libceph-bump-ceph_msg_max_data_len.patch create mode 100644 queue-4.9/mach64-fix-display-corruption-on-big-endian-machines.patch create mode 100644 queue-4.9/mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch create mode 100644 queue-4.9/mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch create mode 100644 queue-4.9/mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch create mode 100644 queue-4.9/netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch create mode 100644 queue-4.9/ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch create mode 100644 queue-4.9/of-numa-validate-some-distance-map-rules.patch create mode 100644 queue-4.9/reset-hisilicon-fix-potential-null-pointer-dereference.patch create mode 100644 queue-4.9/revert-ceph-fix-dentry-leak-in-splice_dentry.patch create mode 100644 queue-4.9/vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch create mode 100644 queue-4.9/xtensa-add-notes-section-to-the-linker-script.patch create mode 100644 queue-4.9/xtensa-fix-boot-parameters-address-translation.patch create mode 100644 queue-4.9/xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch diff --git a/queue-4.9/clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch b/queue-4.9/clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch new file mode 100644 index 00000000000..4aa88a70e69 --- 
/dev/null +++ b/queue-4.9/clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch @@ -0,0 +1,37 @@ +From 0f5cb0e6225cae2f029944cb8c74617aab6ddd49 Mon Sep 17 00:00:00 2001 +From: Ronald Wahl +Date: Wed, 10 Oct 2018 15:54:54 +0200 +Subject: clk: at91: Fix division by zero in PLL recalc_rate() + +From: Ronald Wahl + +commit 0f5cb0e6225cae2f029944cb8c74617aab6ddd49 upstream. + +Commit a982e45dc150 ("clk: at91: PLL recalc_rate() now using cached MUL +and DIV values") removed a check that prevents a division by zero. This +now causes a stacktrace when booting the kernel on a at91 platform if +the PLL DIV register contains zero. This commit reintroduces this check. + +Fixes: a982e45dc150 ("clk: at91: PLL recalc_rate() now using cached...") +Cc: +Signed-off-by: Ronald Wahl +Acked-by: Ludovic Desroches +Signed-off-by: Stephen Boyd +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/clk/at91/clk-pll.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/clk/at91/clk-pll.c ++++ b/drivers/clk/at91/clk-pll.c +@@ -133,6 +133,9 @@ static unsigned long clk_pll_recalc_rate + { + struct clk_pll *pll = to_clk_pll(hw); + ++ if (!pll->div || !pll->mul) ++ return 0; ++ + return (parent_rate / pll->div) * (pll->mul + 1); + } + diff --git a/queue-4.9/clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch b/queue-4.9/clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch new file mode 100644 index 00000000000..d6a28fcaad7 --- /dev/null +++ b/queue-4.9/clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch @@ -0,0 +1,45 @@ +From 665636b2940d0897c4130253467f5e8c42eea392 Mon Sep 17 00:00:00 2001 +From: Enric Balletbo i Serra +Date: Tue, 16 Oct 2018 15:41:44 +0200 +Subject: clk: rockchip: Fix static checker warning in rockchip_ddrclk_get_parent call + +From: Enric Balletbo i Serra + +commit 665636b2940d0897c4130253467f5e8c42eea392 upstream. 
+ +Fixes the signedness bug returning '(-22)' on the return type by removing the +sanity checker in rockchip_ddrclk_get_parent(). The function should return +an unsigned value only and it's safe to remove the sanity checker as the +core functions that call get_parent like clk_core_get_parent_by_index already +ensure the validity of the clk index returned (index >= core->num_parents). + +Fixes: a4f182bf81f18 ("clk: rockchip: add new clock-type for the ddrclk") +Cc: stable@vger.kernel.org +Signed-off-by: Enric Balletbo i Serra +Reviewed-by: Stephen Boyd +Signed-off-by: Heiko Stuebner +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/clk/rockchip/clk-ddr.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/drivers/clk/rockchip/clk-ddr.c ++++ b/drivers/clk/rockchip/clk-ddr.c +@@ -80,16 +80,12 @@ static long rockchip_ddrclk_sip_round_ra + static u8 rockchip_ddrclk_get_parent(struct clk_hw *hw) + { + struct rockchip_ddrclk *ddrclk = to_rockchip_ddrclk_hw(hw); +- int num_parents = clk_hw_get_num_parents(hw); + u32 val; + + val = clk_readl(ddrclk->reg_base + + ddrclk->mux_offset) >> ddrclk->mux_shift; + val &= GENMASK(ddrclk->mux_width - 1, 0); + +- if (val >= num_parents) +- return -EINVAL; +- + return val; + } + diff --git a/queue-4.9/clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch b/queue-4.9/clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch new file mode 100644 index 00000000000..e66bb664d20 --- /dev/null +++ b/queue-4.9/clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch @@ -0,0 +1,75 @@ +From 8985167ecf57f97061599a155bb9652c84ea4913 Mon Sep 17 00:00:00 2001 +From: Krzysztof Kozlowski +Date: Wed, 29 Aug 2018 21:20:10 +0200 +Subject: clk: s2mps11: Fix matching when built as module and DT node contains compatible + +From: Krzysztof Kozlowski + +commit 8985167ecf57f97061599a155bb9652c84ea4913 upstream. 
+ +When driver is built as module and DT node contains clocks compatible +(e.g. "samsung,s2mps11-clk"), the module will not be autoloaded because +module aliases won't match. + +The modalias from uevent: of:NclocksTCsamsung,s2mps11-clk +The modalias from driver: platform:s2mps11-clk + +The devices are instantiated by parent's MFD. However both Device Tree +bindings and parent define the compatible for clocks devices. In case +of module matching this DT compatible will be used. + +The issue will not happen if this is a built-in (no need for module +matching) or when clocks DT node does not contain compatible (not +correct from bindings perspective but working for driver). + +Note when backporting to stable kernels: adjust the list of device ID +entries. + +Cc: +Fixes: 53c31b3437a6 ("mfd: sec-core: Add of_compatible strings for clock MFD cells") +Signed-off-by: Krzysztof Kozlowski +Acked-by: Stephen Boyd +Signed-off-by: Stephen Boyd +Signed-off-by: Greg Kroah-Hartman + +diff --git a/drivers/clk/clk-s2mps11.c b/drivers/clk/clk-s2mps11.c +index d44e0eea31ec..0934d3724495 100644 +--- a/drivers/clk/clk-s2mps11.c ++++ b/drivers/clk/clk-s2mps11.c +@@ -245,6 +245,36 @@ static const struct platform_device_id s2mps11_clk_id[] = { + }; + MODULE_DEVICE_TABLE(platform, s2mps11_clk_id); + ++#ifdef CONFIG_OF ++/* ++ * Device is instantiated through parent MFD device and device matching is done ++ * through platform_device_id. ++ * ++ * However if device's DT node contains proper clock compatible and driver is ++ * built as a module, then the *module* matching will be done trough DT aliases. ++ * This requires of_device_id table. In the same time this will not change the ++ * actual *device* matching so do not add .of_match_table. 
++ */ ++static const struct of_device_id s2mps11_dt_match[] = { ++ { ++ .compatible = "samsung,s2mps11-clk", ++ .data = (void *)S2MPS11X, ++ }, { ++ .compatible = "samsung,s2mps13-clk", ++ .data = (void *)S2MPS13X, ++ }, { ++ .compatible = "samsung,s2mps14-clk", ++ .data = (void *)S2MPS14X, ++ }, { ++ .compatible = "samsung,s5m8767-clk", ++ .data = (void *)S5M8767X, ++ }, { ++ /* Sentinel */ ++ }, ++}; ++MODULE_DEVICE_TABLE(of, s2mps11_dt_match); ++#endif ++ + static struct platform_driver s2mps11_clk_driver = { + .driver = { + .name = "s2mps11-clk", diff --git a/queue-4.9/libceph-bump-ceph_msg_max_data_len.patch b/queue-4.9/libceph-bump-ceph_msg_max_data_len.patch new file mode 100644 index 00000000000..581f1ddfc3a --- /dev/null +++ b/queue-4.9/libceph-bump-ceph_msg_max_data_len.patch @@ -0,0 +1,42 @@ +From 94e6992bb560be8bffb47f287194adf070b57695 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Wed, 26 Sep 2018 18:03:16 +0200 +Subject: libceph: bump CEPH_MSG_MAX_DATA_LEN + +From: Ilya Dryomov + +commit 94e6992bb560be8bffb47f287194adf070b57695 upstream. + +If the read is large enough, we end up spinning in the messenger: + + libceph: osd0 192.168.122.1:6801 io error + libceph: osd0 192.168.122.1:6801 io error + libceph: osd0 192.168.122.1:6801 io error + +This is a receive side limit, so only reads were affected. + +Cc: stable@vger.kernel.org +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/ceph/libceph.h | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/include/linux/ceph/libceph.h ++++ b/include/linux/ceph/libceph.h +@@ -77,7 +77,13 @@ struct ceph_options { + + #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) + #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) +-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) ++ ++/* ++ * Handle the largest possible rbd object in one message. ++ * There is no limit on the size of cephfs objects, but it has to obey ++ * rsize and wsize mount options anyway. 
++ */ ++#define CEPH_MSG_MAX_DATA_LEN (32*1024*1024) + + #define CEPH_AUTH_NAME_DEFAULT "guest" + diff --git a/queue-4.9/mach64-fix-display-corruption-on-big-endian-machines.patch b/queue-4.9/mach64-fix-display-corruption-on-big-endian-machines.patch new file mode 100644 index 00000000000..6cf377bf1ed --- /dev/null +++ b/queue-4.9/mach64-fix-display-corruption-on-big-endian-machines.patch @@ -0,0 +1,59 @@ +From 3c6c6a7878d00a3ac997a779c5b9861ff25dfcc8 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 8 Oct 2018 12:57:34 +0200 +Subject: mach64: fix display corruption on big endian machines +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mikulas Patocka + +commit 3c6c6a7878d00a3ac997a779c5b9861ff25dfcc8 upstream. + +The code for manual bit triple is not endian-clean. It builds the variable +"hostdword" using byte accesses, therefore we must read the variable with +"le32_to_cpu". + +The patch also enables (hardware or software) bit triple only if the image +is monochrome (image->depth). If we want to blit full-color image, we +shouldn't use the triple code. 
+ +Signed-off-by: Mikulas Patocka +Reviewed-by: Ville Syrjälä +Cc: stable@vger.kernel.org +Signed-off-by: Bartlomiej Zolnierkiewicz +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/fbdev/aty/mach64_accel.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/video/fbdev/aty/mach64_accel.c ++++ b/drivers/video/fbdev/aty/mach64_accel.c +@@ -344,7 +344,7 @@ void atyfb_imageblit(struct fb_info *inf + * since Rage 3D IIc we have DP_HOST_TRIPLE_EN bit + * this hwaccelerated triple has an issue with not aligned data + */ +- if (M64_HAS(HW_TRIPLE) && image->width % 8 == 0) ++ if (image->depth == 1 && M64_HAS(HW_TRIPLE) && image->width % 8 == 0) + pix_width |= DP_HOST_TRIPLE_EN; + } + +@@ -381,7 +381,7 @@ void atyfb_imageblit(struct fb_info *inf + src_bytes = (((image->width * image->depth) + 7) / 8) * image->height; + + /* manual triple each pixel */ +- if (info->var.bits_per_pixel == 24 && !(pix_width & DP_HOST_TRIPLE_EN)) { ++ if (image->depth == 1 && info->var.bits_per_pixel == 24 && !(pix_width & DP_HOST_TRIPLE_EN)) { + int inbit, outbit, mult24, byte_id_in_dword, width; + u8 *pbitmapin = (u8*)image->data, *pbitmapout; + u32 hostdword; +@@ -414,7 +414,7 @@ void atyfb_imageblit(struct fb_info *inf + } + } + wait_for_fifo(1, par); +- aty_st_le32(HOST_DATA0, hostdword, par); ++ aty_st_le32(HOST_DATA0, le32_to_cpu(hostdword), par); + } + } else { + u32 *pbitmap, dwords = (src_bytes + 3) / 4; diff --git a/queue-4.9/mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch b/queue-4.9/mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch new file mode 100644 index 00000000000..96b3927e8e5 --- /dev/null +++ b/queue-4.9/mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch @@ -0,0 +1,114 @@ +From c09bcc91bb94ed91f1391bffcbe294963d605732 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Mon, 8 Oct 2018 12:57:35 +0200 +Subject: mach64: fix image corruption due to reading accelerator registers 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mikulas Patocka + +commit c09bcc91bb94ed91f1391bffcbe294963d605732 upstream. + +Reading the registers without waiting for engine idle returns +unpredictable values. These unpredictable values result in display +corruption - if atyfb_imageblit reads the content of DP_PIX_WIDTH with the +bit DP_HOST_TRIPLE_EN set (from previous invocation), the driver would +never ever clear the bit, resulting in display corruption. + +We don't want to wait for idle because it would degrade performance, so +this patch modifies the driver so that it never reads accelerator +registers. + +HOST_CNTL doesn't have to be read, we can just write it with +HOST_BYTE_ALIGN because no other part of the driver cares if +HOST_BYTE_ALIGN is set. + +DP_PIX_WIDTH is written in the functions atyfb_copyarea and atyfb_fillrect +with the default value and in atyfb_imageblit with the value set according +to the source image data. 
+ +Signed-off-by: Mikulas Patocka +Reviewed-by: Ville Syrjälä +Cc: stable@vger.kernel.org +Signed-off-by: Bartlomiej Zolnierkiewicz +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/fbdev/aty/mach64_accel.c | 22 +++++++++------------- + 1 file changed, 9 insertions(+), 13 deletions(-) + +--- a/drivers/video/fbdev/aty/mach64_accel.c ++++ b/drivers/video/fbdev/aty/mach64_accel.c +@@ -126,7 +126,7 @@ void aty_init_engine(struct atyfb_par *p + + /* set host attributes */ + wait_for_fifo(13, par); +- aty_st_le32(HOST_CNTL, 0, par); ++ aty_st_le32(HOST_CNTL, HOST_BYTE_ALIGN, par); + + /* set pattern attributes */ + aty_st_le32(PAT_REG0, 0, par); +@@ -232,7 +232,8 @@ void atyfb_copyarea(struct fb_info *info + rotation = rotation24bpp(dx, direction); + } + +- wait_for_fifo(4, par); ++ wait_for_fifo(5, par); ++ aty_st_le32(DP_PIX_WIDTH, par->crtc.dp_pix_width, par); + aty_st_le32(DP_SRC, FRGD_SRC_BLIT, par); + aty_st_le32(SRC_Y_X, (sx << 16) | sy, par); + aty_st_le32(SRC_HEIGHT1_WIDTH1, (width << 16) | area->height, par); +@@ -268,7 +269,8 @@ void atyfb_fillrect(struct fb_info *info + rotation = rotation24bpp(dx, DST_X_LEFT_TO_RIGHT); + } + +- wait_for_fifo(3, par); ++ wait_for_fifo(4, par); ++ aty_st_le32(DP_PIX_WIDTH, par->crtc.dp_pix_width, par); + aty_st_le32(DP_FRGD_CLR, color, par); + aty_st_le32(DP_SRC, + BKGD_SRC_BKGD_CLR | FRGD_SRC_FRGD_CLR | MONO_SRC_ONE, +@@ -283,7 +285,7 @@ void atyfb_imageblit(struct fb_info *inf + { + struct atyfb_par *par = (struct atyfb_par *) info->par; + u32 src_bytes, dx = image->dx, dy = image->dy, width = image->width; +- u32 pix_width_save, pix_width, host_cntl, rotation = 0, src, mix; ++ u32 pix_width, rotation = 0, src, mix; + + if (par->asleep) + return; +@@ -295,8 +297,7 @@ void atyfb_imageblit(struct fb_info *inf + return; + } + +- pix_width = pix_width_save = aty_ld_le32(DP_PIX_WIDTH, par); +- host_cntl = aty_ld_le32(HOST_CNTL, par) | HOST_BYTE_ALIGN; ++ pix_width = par->crtc.dp_pix_width; + + switch (image->depth) { + 
case 1: +@@ -369,12 +370,11 @@ void atyfb_imageblit(struct fb_info *inf + mix = FRGD_MIX_D_XOR_S | BKGD_MIX_D; + } + +- wait_for_fifo(6, par); +- aty_st_le32(DP_WRITE_MASK, 0xFFFFFFFF, par); ++ wait_for_fifo(5, par); + aty_st_le32(DP_PIX_WIDTH, pix_width, par); + aty_st_le32(DP_MIX, mix, par); + aty_st_le32(DP_SRC, src, par); +- aty_st_le32(HOST_CNTL, host_cntl, par); ++ aty_st_le32(HOST_CNTL, HOST_BYTE_ALIGN, par); + aty_st_le32(DST_CNTL, DST_Y_TOP_TO_BOTTOM | DST_X_LEFT_TO_RIGHT | rotation, par); + + draw_rect(dx, dy, width, image->height, par); +@@ -423,8 +423,4 @@ void atyfb_imageblit(struct fb_info *inf + aty_st_le32(HOST_DATA0, get_unaligned_le32(pbitmap), par); + } + } +- +- /* restore pix_width */ +- wait_for_fifo(1, par); +- aty_st_le32(DP_PIX_WIDTH, pix_width_save, par); + } diff --git a/queue-4.9/mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch b/queue-4.9/mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch new file mode 100644 index 00000000000..f9fdbb9166b --- /dev/null +++ b/queue-4.9/mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch @@ -0,0 +1,228 @@ +From ac5b2c18911ffe95c08d69273917f90212cf5659 Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli +Date: Fri, 2 Nov 2018 15:47:59 -0700 +Subject: mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings + +From: Andrea Arcangeli + +commit ac5b2c18911ffe95c08d69273917f90212cf5659 upstream. + +THP allocation might be really disruptive when allocated on NUMA system +with the local node full or hard to reclaim. 
Stefan has posted an +allocation stall report on 4.12 based SLES kernel which suggests the +same issue: + + kvm: page allocation stalls for 194572ms, order:9, mode:0x4740ca(__GFP_HIGHMEM|__GFP_IO|__GFP_FS|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE|__GFP_MOVABLE|__GFP_DIRECT_RECLAIM), nodemask=(null) + kvm cpuset=/ mems_allowed=0-1 + CPU: 10 PID: 84752 Comm: kvm Tainted: G W 4.12.0+98-ph 0000001 SLE15 (unreleased) + Hardware name: Supermicro SYS-1029P-WTRT/X11DDW-NT, BIOS 2.0 12/05/2017 + Call Trace: + dump_stack+0x5c/0x84 + warn_alloc+0xe0/0x180 + __alloc_pages_slowpath+0x820/0xc90 + __alloc_pages_nodemask+0x1cc/0x210 + alloc_pages_vma+0x1e5/0x280 + do_huge_pmd_wp_page+0x83f/0xf00 + __handle_mm_fault+0x93d/0x1060 + handle_mm_fault+0xc6/0x1b0 + __do_page_fault+0x230/0x430 + do_page_fault+0x2a/0x70 + page_fault+0x7b/0x80 + [...] + Mem-Info: + active_anon:126315487 inactive_anon:1612476 isolated_anon:5 + active_file:60183 inactive_file:245285 isolated_file:0 + unevictable:15657 dirty:286 writeback:1 unstable:0 + slab_reclaimable:75543 slab_unreclaimable:2509111 + mapped:81814 shmem:31764 pagetables:370616 bounce:0 + free:32294031 free_pcp:6233 free_cma:0 + Node 0 active_anon:254680388kB inactive_anon:1112760kB active_file:240648kB inactive_file:981168kB unevictable:13368kB isolated(anon):0kB isolated(file):0kB mapped:280240kB dirty:1144kB writeback:0kB shmem:95832kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 81225728kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no + Node 1 active_anon:250583072kB inactive_anon:5337144kB active_file:84kB inactive_file:0kB unevictable:49260kB isolated(anon):20kB isolated(file):0kB mapped:47016kB dirty:0kB writeback:4kB shmem:31224kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 31897600kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no + +The defrag mode is "madvise" and from the above report it is clear that +the THP has been allocated for MADV_HUGEPAGA vma. 
+ +Andrea has identified that the main source of the problem is +__GFP_THISNODE usage: + +: The problem is that direct compaction combined with the NUMA +: __GFP_THISNODE logic in mempolicy.c is telling reclaim to swap very +: hard the local node, instead of failing the allocation if there's no +: THP available in the local node. +: +: Such logic was ok until __GFP_THISNODE was added to the THP allocation +: path even with MPOL_DEFAULT. +: +: The idea behind the __GFP_THISNODE addition, is that it is better to +: provide local memory in PAGE_SIZE units than to use remote NUMA THP +: backed memory. That largely depends on the remote latency though, on +: threadrippers for example the overhead is relatively low in my +: experience. +: +: The combination of __GFP_THISNODE and __GFP_DIRECT_RECLAIM results in +: extremely slow qemu startup with vfio, if the VM is larger than the +: size of one host NUMA node. This is because it will try very hard to +: unsuccessfully swapout get_user_pages pinned pages as result of the +: __GFP_THISNODE being set, instead of falling back to PAGE_SIZE +: allocations and instead of trying to allocate THP on other nodes (it +: would be even worse without vfio type1 GUP pins of course, except it'd +: be swapping heavily instead). + +Fix this by removing __GFP_THISNODE for THP requests which are +requesting the direct reclaim. This effectively reverts 5265047ac301 +on the grounds that the zone/node reclaim was known to be disruptive due +to premature reclaim when there was memory free. While it made sense at +the time for HPC workloads without NUMA awareness on rare machines, it +was ultimately harmful in the majority of cases. The existing behaviour +is similar, if not as widespread as it applies to a corner case but +crucially, it cannot be tuned around like zone_reclaim_mode can. The +default behaviour should always be to cause the least harm for the +common case. 
+ +If there are specialised use cases out there that want zone_reclaim_mode +in specific cases, then it can be built on top. Longterm we should +consider a memory policy which allows for the node reclaim like behavior +for the specific memory ranges which would allow a + +[1] http://lkml.kernel.org/r/20180820032204.9591-1-aarcange@redhat.com + +Mel said: + +: Both patches look correct to me but I'm responding to this one because +: it's the fix. The change makes sense and moves further away from the +: severe stalling behaviour we used to see with both THP and zone reclaim +: mode. +: +: I put together a basic experiment with usemem configured to reference a +: buffer multiple times that is 80% the size of main memory on a 2-socket +: box with symmetric node sizes and defrag set to "always". The defrag +: setting is not the default but it would be functionally similar to +: accessing a buffer with madvise(MADV_HUGEPAGE). Usemem is configured to +: reference the buffer multiple times and while it's not an interesting +: workload, it would be expected to complete reasonably quickly as it fits +: within memory. The results were; +: +: usemem +: vanilla noreclaim-v1 +: Amean Elapsd-1 42.78 ( 0.00%) 26.87 ( 37.18%) +: Amean Elapsd-3 27.55 ( 0.00%) 7.44 ( 73.00%) +: Amean Elapsd-4 5.72 ( 0.00%) 5.69 ( 0.45%) +: +: This shows the elapsed time in seconds for 1 thread, 3 threads and 4 +: threads referencing buffers 80% the size of memory. With the patches +: applied, it's 37.18% faster for the single thread and 73% faster with two +: threads. Note that 4 threads showing little difference does not indicate +: the problem is related to thread counts. It's simply the case that 4 +: threads gets spread so their workload mostly fits in one node. 
+: +: The overall view from /proc/vmstats is more startling +: +: 4.19.0-rc1 4.19.0-rc1 +: vanillanoreclaim-v1r1 +: Minor Faults 35593425 708164 +: Major Faults 484088 36 +: Swap Ins 3772837 0 +: Swap Outs 3932295 0 +: +: Massive amounts of swap in/out without the patch +: +: Direct pages scanned 6013214 0 +: Kswapd pages scanned 0 0 +: Kswapd pages reclaimed 0 0 +: Direct pages reclaimed 4033009 0 +: +: Lots of reclaim activity without the patch +: +: Kswapd efficiency 100% 100% +: Kswapd velocity 0.000 0.000 +: Direct efficiency 67% 100% +: Direct velocity 11191.956 0.000 +: +: Mostly from direct reclaim context as you'd expect without the patch. +: +: Page writes by reclaim 3932314.000 0.000 +: Page writes file 19 0 +: Page writes anon 3932295 0 +: Page reclaim immediate 42336 0 +: +: Writes from reclaim context is never good but the patch eliminates it. +: +: We should never have default behaviour to thrash the system for such a +: basic workload. If zone reclaim mode behaviour is ever desired but on a +: single task instead of a global basis then the sensible option is to build +: a mempolicy that enforces that behaviour. + +This was a severe regression compared to previous kernels that made +important workloads unusable and it starts when __GFP_THISNODE was +added to THP allocations under MADV_HUGEPAGE. It is not a significant +risk to go to the previous behavior before __GFP_THISNODE was added, it +worked like that for years. + +This was simply an optimization to some lucky workloads that can fit in +a single node, but it ended up breaking the VM for others that can't +possibly fit in a single node, so going back is safe. 
+ +[mhocko@suse.com: rewrote the changelog based on the one from Andrea] +Link: http://lkml.kernel.org/r/20180925120326.24392-2-mhocko@kernel.org +Fixes: 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node") +Signed-off-by: Andrea Arcangeli +Signed-off-by: Michal Hocko +Reported-by: Stefan Priebe +Debugged-by: Andrea Arcangeli +Reported-by: Alex Williamson +Reviewed-by: Mel Gorman +Tested-by: Mel Gorman +Cc: Zi Yan +Cc: Vlastimil Babka +Cc: David Rientjes +Cc: "Kirill A. Shutemov" +Cc: [4.1+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mempolicy.c | 32 ++++++++++++++++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -2027,8 +2027,36 @@ retry_cpuset: + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); +- page = __alloc_pages_node(hpage_node, +- gfp | __GFP_THISNODE, order); ++ /* ++ * We cannot invoke reclaim if __GFP_THISNODE ++ * is set. Invoking reclaim with ++ * __GFP_THISNODE set, would cause THP ++ * allocations to trigger heavy swapping ++ * despite there may be tons of free memory ++ * (including potentially plenty of THP ++ * already available in the buddy) on all the ++ * other NUMA nodes. ++ * ++ * At most we could invoke compaction when ++ * __GFP_THISNODE is set (but we would need to ++ * refrain from invoking reclaim even if ++ * compaction returned COMPACT_SKIPPED because ++ * there wasn't not enough memory to succeed ++ * compaction). For now just avoid ++ * __GFP_THISNODE instead of limiting the ++ * allocation path to a strict and single ++ * compaction invocation. ++ * ++ * Supposedly if direct reclaim was enabled by ++ * the caller, the app prefers THP regardless ++ * of the node it comes from so this would be ++ * more desiderable behavior than only ++ * providing THP originated from the local ++ * node in such case. 
++ */ ++ if (!(gfp & __GFP_DIRECT_RECLAIM)) ++ gfp |= __GFP_THISNODE; ++ page = __alloc_pages_node(hpage_node, gfp, order); + goto out; + } + } diff --git a/queue-4.9/mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch b/queue-4.9/mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch new file mode 100644 index 00000000000..c68d95ad743 --- /dev/null +++ b/queue-4.9/mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch @@ -0,0 +1,51 @@ +From be2e1c9dcf76886a83fb1c433a316e26d4ca2550 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Thu, 11 Oct 2018 13:06:16 +0200 +Subject: mtd: docg3: don't set conflicting BCH_CONST_PARAMS option + +From: Arnd Bergmann + +commit be2e1c9dcf76886a83fb1c433a316e26d4ca2550 upstream. + +I noticed during the creation of another bugfix that the BCH_CONST_PARAMS +option that is set by DOCG3 breaks setting variable parameters for any +other users of the BCH library code. + +The only other user we have today is the MTD_NAND software BCH +implementation (most flash controllers use hardware BCH these days +and are not affected). I considered removing BCH_CONST_PARAMS entirely +because of the inherent conflict, but according to the description in +lib/bch.c there is a significant performance benefit in keeping it. + +To avoid the immediate problem of the conflict between MTD_NAND_BCH +and DOCG3, this only sets the constant parameters if MTD_NAND_BCH +is disabled, which should fix the problem for all cases that +are affected. This should also work for all stable kernels. + +Note that there is only one machine that actually seems to use the +DOCG3 driver (arch/arm/mach-pxa/mioa701.c), so most users should have +the driver disabled, but it almost certainly shows up if we wanted +to test random kernels on machines that use software BCH in MTD. 
+ +Fixes: d13d19ece39f ("mtd: docg3: add ECC correction code") +Cc: stable@vger.kernel.org +Cc: Robert Jarzmik +Signed-off-by: Arnd Bergmann +Signed-off-by: Boris Brezillon +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mtd/devices/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/mtd/devices/Kconfig ++++ b/drivers/mtd/devices/Kconfig +@@ -196,7 +196,7 @@ comment "Disk-On-Chip Device Drivers" + config MTD_DOCG3 + tristate "M-Systems Disk-On-Chip G3" + select BCH +- select BCH_CONST_PARAMS ++ select BCH_CONST_PARAMS if !MTD_NAND_BCH + select BITREVERSE + ---help--- + This provides an MTD device driver for the M-Systems DiskOnChip diff --git a/queue-4.9/netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch b/queue-4.9/netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch new file mode 100644 index 00000000000..73d9dd80067 --- /dev/null +++ b/queue-4.9/netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch @@ -0,0 +1,59 @@ +From f393808dc64149ccd0e5a8427505ba2974a59854 Mon Sep 17 00:00:00 2001 +From: Vasily Khoruzhick +Date: Thu, 25 Oct 2018 12:15:43 -0700 +Subject: netfilter: conntrack: fix calculation of next bucket number in early_drop + +From: Vasily Khoruzhick + +commit f393808dc64149ccd0e5a8427505ba2974a59854 upstream. + +If there's no entry to drop in bucket that corresponds to the hash, +early_drop() should look for it in other buckets. But since it increments +hash instead of bucket number, it actually looks in the same bucket 8 +times: hsize is 16k by default (14 bits) and hash is 32-bit value, so +reciprocal_scale(hash, hsize) returns the same value for hash..hash+7 in +most cases. + +Fix it by increasing bucket number instead of hash and rename _hash +to bucket to avoid future confusion. 
+ +Fixes: 3e86638e9a0b ("netfilter: conntrack: consider ct netns in early_drop logic") +Cc: # v4.7+ +Signed-off-by: Vasily Khoruzhick +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conntrack_core.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -918,19 +918,22 @@ static unsigned int early_drop_list(stru + return drops; + } + +-static noinline int early_drop(struct net *net, unsigned int _hash) ++static noinline int early_drop(struct net *net, unsigned int hash) + { +- unsigned int i; ++ unsigned int i, bucket; + + for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { + struct hlist_nulls_head *ct_hash; +- unsigned int hash, hsize, drops; ++ unsigned int hsize, drops; + + rcu_read_lock(); + nf_conntrack_get_ht(&ct_hash, &hsize); +- hash = reciprocal_scale(_hash++, hsize); ++ if (!i) ++ bucket = reciprocal_scale(hash, hsize); ++ else ++ bucket = (bucket + 1) % hsize; + +- drops = early_drop_list(net, &ct_hash[hash]); ++ drops = early_drop_list(net, &ct_hash[bucket]); + rcu_read_unlock(); + + if (drops) { diff --git a/queue-4.9/ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch b/queue-4.9/ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch new file mode 100644 index 00000000000..aca9e6944a9 --- /dev/null +++ b/queue-4.9/ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch @@ -0,0 +1,54 @@ +From 29aa30167a0a2e6045a0d6d2e89d8168132333d5 Mon Sep 17 00:00:00 2001 +From: Changwei Ge +Date: Fri, 2 Nov 2018 15:48:15 -0700 +Subject: ocfs2: fix a misuse a of brelse after failing ocfs2_check_dir_entry + +From: Changwei Ge + +commit 29aa30167a0a2e6045a0d6d2e89d8168132333d5 upstream. + +Somehow, file system metadata was corrupted, which causes +ocfs2_check_dir_entry() to fail in function ocfs2_dir_foreach_blk_el(). 
+ +According to the original design intention, if above happens we should +skip the problematic block and continue to retrieve dir entry. But +there is an obvious misuse of brelse around related code. + +After failure of ocfs2_check_dir_entry(), current code just moves to +next position and uses the problematic buffer head again and again +during which the problematic buffer head is released multiple times. +I suppose this is a serious issue which is long-lived in ocfs2. This may +cause other file systems which are also used on the same host to go insane. + +So we should also consider backporting this patch into linux +-stable. + +Link: http://lkml.kernel.org/r/HK2PR06MB045211675B43EED794E597B6D56E0@HK2PR06MB0452.apcprd06.prod.outlook.com +Signed-off-by: Changwei Ge +Suggested-by: Changkuo Shi +Reviewed-by: Andrew Morton +Cc: Mark Fasheh +Cc: Joel Becker +Cc: Junxiao Bi +Cc: Joseph Qi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/dir.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/ocfs2/dir.c ++++ b/fs/ocfs2/dir.c +@@ -1896,8 +1896,7 @@ static int ocfs2_dir_foreach_blk_el(stru + /* On error, skip the f_pos to the + next block. */ + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; +- brelse(bh); +- continue; ++ break; + } + if (le64_to_cpu(de->inode)) { + unsigned char d_type = DT_UNKNOWN; diff --git a/queue-4.9/of-numa-validate-some-distance-map-rules.patch b/queue-4.9/of-numa-validate-some-distance-map-rules.patch new file mode 100644 index 00000000000..79f4be765d2 --- /dev/null +++ b/queue-4.9/of-numa-validate-some-distance-map-rules.patch @@ -0,0 +1,80 @@ +From 89c38422e072bb453e3045b8f1b962a344c3edea Mon Sep 17 00:00:00 2001 +From: John Garry +Date: Thu, 8 Nov 2018 18:17:03 +0800 +Subject: of, numa: Validate some distance map rules + +From: John Garry + +commit 89c38422e072bb453e3045b8f1b962a344c3edea upstream. 
+ +Currently the NUMA distance map parsing does not validate the distance +table for the distance-matrix rules 1-2 in [1]. + +However the arch NUMA code may enforce some of these rules, but not all. +Such is the case for the arm64 port, which does not enforce the rule that +the distance between separate nodes cannot equal LOCAL_DISTANCE. + +The patch adds the following rules validation: +- distance of node to self equals LOCAL_DISTANCE +- distance of separate nodes > LOCAL_DISTANCE + +This change avoids a yet-unresolved crash reported in [2]. + +A note on dealing with symmetrical distances between nodes: + +Validating symmetrical distances between nodes is difficult. If it were +mandated in the bindings that every distance must be recorded in the +table, then it would be easy. However, it isn't. + +In addition to this, it is also possible to record [b, a] distance only +(and not [a, b]). So, when processing the table for [b, a], we cannot +assert that current distance of [a, b] != [b, a] as invalid, as [a, b] +distance may not be present in the table and current distance would be +default at REMOTE_DISTANCE. + +As such, we maintain the policy that we overwrite distance [a, b] = [b, a] +for b > a. This policy is different to kernel ACPI SLIT validation, which +allows non-symmetrical distances (ACPI spec SLIT rules allow it). However, +the distance debug message is dropped as it may be misleading (for a distance +which is later overwritten). + +Some final notes on semantics: + +- It is implied that it is the responsibility of the arch NUMA code to + reset the NUMA distance map for an error in distance map parsing. + +- It is the responsibility of the FW NUMA topology parsing (whether OF or + ACPI) to enforce NUMA distance rules, and not arch NUMA code. 
+ +[1] Documentation/devicetree/bindings/numa.txt +[2] https://www.spinics.net/lists/arm-kernel/msg683304.html + +Cc: stable@vger.kernel.org # 4.7 +Signed-off-by: John Garry +Acked-by: Will Deacon +Signed-off-by: Rob Herring +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/of/of_numa.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/of/of_numa.c ++++ b/drivers/of/of_numa.c +@@ -126,9 +126,14 @@ static int __init of_numa_parse_distance + distance = of_read_number(matrix, 1); + matrix++; + ++ if ((nodea == nodeb && distance != LOCAL_DISTANCE) || ++ (nodea != nodeb && distance <= LOCAL_DISTANCE)) { ++ pr_err("Invalid distance[node%d -> node%d] = %d\n", ++ nodea, nodeb, distance); ++ return -EINVAL; ++ } ++ + numa_set_distance(nodea, nodeb, distance); +- pr_debug("distance[node%d -> node%d] = %d\n", +- nodea, nodeb, distance); + + /* Set default distance of node B->A same as A->B */ + if (nodeb > nodea) diff --git a/queue-4.9/reset-hisilicon-fix-potential-null-pointer-dereference.patch b/queue-4.9/reset-hisilicon-fix-potential-null-pointer-dereference.patch new file mode 100644 index 00000000000..f1d79e6aed8 --- /dev/null +++ b/queue-4.9/reset-hisilicon-fix-potential-null-pointer-dereference.patch @@ -0,0 +1,42 @@ +From e9a2310fb689151166df7fd9971093362d34bd79 Mon Sep 17 00:00:00 2001 +From: "Gustavo A. R. Silva" +Date: Wed, 25 Jul 2018 19:47:19 -0500 +Subject: reset: hisilicon: fix potential NULL pointer dereference + +From: Gustavo A. R. Silva + +commit e9a2310fb689151166df7fd9971093362d34bd79 upstream. + +There is a potential execution path in which function +platform_get_resource() returns NULL. If this happens, +we will end up having a NULL pointer dereference. + +Fix this by replacing devm_ioremap with devm_ioremap_resource, +which has the NULL check and the memory region request. + +This code was detected with the help of Coccinelle. 
+ +Cc: stable@vger.kernel.org +Fixes: 97b7129cd2af ("reset: hisilicon: change the definition of hisi_reset_init") +Signed-off-by: Gustavo A. R. Silva +Signed-off-by: Stephen Boyd +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/clk/hisilicon/reset.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/drivers/clk/hisilicon/reset.c ++++ b/drivers/clk/hisilicon/reset.c +@@ -109,9 +109,8 @@ struct hisi_reset_controller *hisi_reset + return NULL; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); +- rstc->membase = devm_ioremap(&pdev->dev, +- res->start, resource_size(res)); +- if (!rstc->membase) ++ rstc->membase = devm_ioremap_resource(&pdev->dev, res); ++ if (IS_ERR(rstc->membase)) + return NULL; + + spin_lock_init(&rstc->lock); diff --git a/queue-4.9/revert-ceph-fix-dentry-leak-in-splice_dentry.patch b/queue-4.9/revert-ceph-fix-dentry-leak-in-splice_dentry.patch new file mode 100644 index 00000000000..bc2d7ddb9ed --- /dev/null +++ b/queue-4.9/revert-ceph-fix-dentry-leak-in-splice_dentry.patch @@ -0,0 +1,41 @@ +From efe328230dc01aa0b1269aad0b5fae73eea4677a Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Thu, 27 Sep 2018 21:16:05 +0800 +Subject: Revert "ceph: fix dentry leak in splice_dentry()" + +From: Yan, Zheng + +commit efe328230dc01aa0b1269aad0b5fae73eea4677a upstream. + +This reverts commit 8b8f53af1ed9df88a4c0fbfdf3db58f62060edf3. + +splice_dentry() is used by three places. For two places, req->r_dentry +is passed to splice_dentry(). In the case of error, req->r_dentry does +not get updated. So splice_dentry() should not drop reference. 
+ +Cc: stable@vger.kernel.org # 4.18+ +Signed-off-by: "Yan, Zheng" +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ceph/inode.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/fs/ceph/inode.c ++++ b/fs/ceph/inode.c +@@ -1077,8 +1077,12 @@ static struct dentry *splice_dentry(stru + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); +- dput(dn); +- dn = realdn; /* note realdn contains the error */ ++ dn = realdn; ++ /* ++ * Caller should release 'dn' in the case of error. ++ * If 'req->r_dentry' is passed to this function, ++ * caller should leave 'req->r_dentry' untouched. ++ */ + goto out; + } else if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " diff --git a/queue-4.9/series b/queue-4.9/series index 2c3efd2c8ac..406c359ff99 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -28,3 +28,20 @@ parisc-fix-hpmc-handler-by-increasing-size-to-multip.patch parisc-fix-exported-address-of-os_hpmc-handler.patch mips-loongson-3-fix-cpu-uart-irq-delivery-problem.patch mips-loongson-3-fix-bridge-irq-delivery-problem.patch +xtensa-add-notes-section-to-the-linker-script.patch +xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch +xtensa-fix-boot-parameters-address-translation.patch +clk-s2mps11-fix-matching-when-built-as-module-and-dt-node-contains-compatible.patch +clk-at91-fix-division-by-zero-in-pll-recalc_rate.patch +clk-rockchip-fix-static-checker-warning-in-rockchip_ddrclk_get_parent-call.patch +libceph-bump-ceph_msg_max_data_len.patch +revert-ceph-fix-dentry-leak-in-splice_dentry.patch +mach64-fix-display-corruption-on-big-endian-machines.patch +mach64-fix-image-corruption-due-to-reading-accelerator-registers.patch +reset-hisilicon-fix-potential-null-pointer-dereference.patch +vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch +ocfs2-fix-a-misuse-a-of-brelse-after-failing-ocfs2_check_dir_entry.patch 
+mm-thp-relax-__gfp_thisnode-for-madv_hugepage-mappings.patch +netfilter-conntrack-fix-calculation-of-next-bucket-number-in-early_drop.patch +mtd-docg3-don-t-set-conflicting-bch_const_params-option.patch +of-numa-validate-some-distance-map-rules.patch diff --git a/queue-4.9/vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch b/queue-4.9/vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch new file mode 100644 index 00000000000..7f50a92fbb5 --- /dev/null +++ b/queue-4.9/vhost-scsi-truncate-t10-pi-iov_iter-to-prot_bytes.patch @@ -0,0 +1,47 @@ +From 4542d623c7134bc1738f8a68ccb6dd546f1c264f Mon Sep 17 00:00:00 2001 +From: Greg Edwards +Date: Wed, 22 Aug 2018 13:21:53 -0600 +Subject: vhost/scsi: truncate T10 PI iov_iter to prot_bytes + +From: Greg Edwards + +commit 4542d623c7134bc1738f8a68ccb6dd546f1c264f upstream. + +Commands with protection information included were not truncating the +protection iov_iter to the number of protection bytes in the command. +This resulted in vhost_scsi mis-calculating the size of the protection +SGL in vhost_scsi_calc_sgls(), and including both the protection and +data SG entries in the protection SGL. + +Fixes: 09b13fa8c1a1 ("vhost/scsi: Add ANY_LAYOUT support in vhost_scsi_handle_vq") +Signed-off-by: Greg Edwards +Signed-off-by: Michael S. Tsirkin +Fixes: 09b13fa8c1a1093e9458549ac8bb203a7c65c62a +Cc: stable@vger.kernel.org +Reviewed-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/scsi.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -999,7 +999,8 @@ vhost_scsi_handle_vq(struct vhost_scsi * + prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin); + } + /* +- * Set prot_iter to data_iter, and advance past any ++ * Set prot_iter to data_iter and truncate it to ++ * prot_bytes, and advance data_iter past any + * preceeding prot_bytes that may be present. 
+ * + * Also fix up the exp_data_len to reflect only the +@@ -1008,6 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + if (prot_bytes) { + exp_data_len -= prot_bytes; + prot_iter = data_iter; ++ iov_iter_truncate(&prot_iter, prot_bytes); + iov_iter_advance(&data_iter, prot_bytes); + } + tag = vhost64_to_cpu(vq, v_req_pi.tag); diff --git a/queue-4.9/xtensa-add-notes-section-to-the-linker-script.patch b/queue-4.9/xtensa-add-notes-section-to-the-linker-script.patch new file mode 100644 index 00000000000..8f6abcb1520 --- /dev/null +++ b/queue-4.9/xtensa-add-notes-section-to-the-linker-script.patch @@ -0,0 +1,44 @@ +From 4119ba211bc4f1bf638f41e50b7a0f329f58aa16 Mon Sep 17 00:00:00 2001 +From: Max Filippov +Date: Mon, 29 Oct 2018 18:30:13 -0700 +Subject: xtensa: add NOTES section to the linker script + +From: Max Filippov + +commit 4119ba211bc4f1bf638f41e50b7a0f329f58aa16 upstream. + +This section collects all source .note.* sections together in the +vmlinux image. Without it .note.Linux section may be placed at address +0, while the rest of the kernel is at its normal address, resulting in a +huge vmlinux.bin image that may not be linked into the xtensa Image.elf. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Max Filippov +Signed-off-by: Greg Kroah-Hartman + +--- + arch/xtensa/boot/Makefile | 2 +- + arch/xtensa/kernel/vmlinux.lds.S | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/xtensa/boot/Makefile ++++ b/arch/xtensa/boot/Makefile +@@ -31,7 +31,7 @@ $(bootdir-y): $(addprefix $(obj)/,$(subd + $(addprefix $(obj)/,$(host-progs)) + $(Q)$(MAKE) $(build)=$(obj)/$@ $(MAKECMDGOALS) + +-OBJCOPYFLAGS = --strip-all -R .comment -R .note.gnu.build-id -O binary ++OBJCOPYFLAGS = --strip-all -R .comment -R .notes -O binary + + vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) +--- a/arch/xtensa/kernel/vmlinux.lds.S ++++ b/arch/xtensa/kernel/vmlinux.lds.S +@@ -109,6 +109,7 @@ SECTIONS + .fixup : { *(.fixup) } + + EXCEPTION_TABLE(16) ++ NOTES + /* Data section */ + + _sdata = .; diff --git a/queue-4.9/xtensa-fix-boot-parameters-address-translation.patch b/queue-4.9/xtensa-fix-boot-parameters-address-translation.patch new file mode 100644 index 00000000000..906f7206e2a --- /dev/null +++ b/queue-4.9/xtensa-fix-boot-parameters-address-translation.patch @@ -0,0 +1,43 @@ +From 40dc948f234b73497c3278875eb08a01d5854d3f Mon Sep 17 00:00:00 2001 +From: Max Filippov +Date: Tue, 13 Nov 2018 23:46:42 -0800 +Subject: xtensa: fix boot parameters address translation + +From: Max Filippov + +commit 40dc948f234b73497c3278875eb08a01d5854d3f upstream. + +The bootloader may pass the physical address of the boot parameters structure +to the MMUv3 kernel in the register a2. Code in the _SetupMMU block in +the arch/xtensa/kernel/head.S is supposed to map that physical address to +the virtual address in the configured virtual memory layout. + +This code hasn't been updated when additional 256+256 and 512+512 +memory layouts were introduced and it may produce wrong addresses when +used with these layouts. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Max Filippov +Signed-off-by: Greg Kroah-Hartman + +--- + arch/xtensa/kernel/head.S | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/arch/xtensa/kernel/head.S ++++ b/arch/xtensa/kernel/head.S +@@ -88,9 +88,12 @@ _SetupMMU: + initialize_mmu + #if defined(CONFIG_MMU) && XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY + rsr a2, excsave1 +- movi a3, 0x08000000 ++ movi a3, XCHAL_KSEG_PADDR ++ bltu a2, a3, 1f ++ sub a2, a2, a3 ++ movi a3, XCHAL_KSEG_SIZE + bgeu a2, a3, 1f +- movi a3, 0xd0000000 ++ movi a3, XCHAL_KSEG_CACHED_VADDR + add a2, a2, a3 + wsr a2, excsave1 + 1: diff --git a/queue-4.9/xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch b/queue-4.9/xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch new file mode 100644 index 00000000000..9b0d5850299 --- /dev/null +++ b/queue-4.9/xtensa-make-sure-bflt-stack-is-16-byte-aligned.patch @@ -0,0 +1,45 @@ +From 0773495b1f5f1c5e23551843f87b5ff37e7af8f7 Mon Sep 17 00:00:00 2001 +From: Max Filippov +Date: Sun, 4 Nov 2018 01:46:00 -0700 +Subject: xtensa: make sure bFLT stack is 16 byte aligned + +From: Max Filippov + +commit 0773495b1f5f1c5e23551843f87b5ff37e7af8f7 upstream. + +Xtensa ABI requires stack alignment to be at least 16. In noMMU +configuration ARCH_SLAB_MINALIGN is used to align stack. Make it at +least 16. 
+ +This fixes the following runtime error in noMMU configuration, caused by +interaction between insufficiently aligned stack and alloca function, +that results in corruption of on-stack variable in the libc function +glob: + + Caught unhandled exception in 'sh' (pid = 47, pc = 0x02d05d65) + - should not happen + EXCCAUSE is 15 + +Cc: stable@vger.kernel.org +Signed-off-by: Max Filippov +Signed-off-by: Greg Kroah-Hartman + +--- + arch/xtensa/include/asm/processor.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/xtensa/include/asm/processor.h ++++ b/arch/xtensa/include/asm/processor.h +@@ -24,7 +24,11 @@ + # error Linux requires the Xtensa Windowed Registers Option. + #endif + +-#define ARCH_SLAB_MINALIGN XCHAL_DATA_WIDTH ++/* Xtensa ABI requires stack alignment to be at least 16 */ ++ ++#define STACK_ALIGN (XCHAL_DATA_WIDTH > 16 ? XCHAL_DATA_WIDTH : 16) ++ ++#define ARCH_SLAB_MINALIGN STACK_ALIGN + + /* + * User space process size: 1 GB. -- 2.47.2