5.4-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)
diff --git a/queue-5.4/arm64-dts-imx8mm-evk-correct-ldo1-ldo2-voltage-range.patch b/queue-5.4/arm64-dts-imx8mm-evk-correct-ldo1-ldo2-voltage-range.patch

new file mode 100644 (file)

index 0000000..b2461f5
--- /dev/null
+++ b/queue-5.4/arm64-dts-imx8mm-evk-correct-ldo1-ldo2-voltage-range.patch
@@ -0,0 +1,51 @@
+From 4fd6b5735c03c0955d93960d31f17d7144f5578f Mon Sep 17 00:00:00 2001
+From: Robin Gong <yibin.gong@nxp.com>
+Date: Fri, 22 May 2020 18:44:50 +0800
+Subject: arm64: dts: imx8mm-evk: correct ldo1/ldo2 voltage range
+
+From: Robin Gong <yibin.gong@nxp.com>
+
+commit 4fd6b5735c03c0955d93960d31f17d7144f5578f upstream.
+
+Correct ldo1 voltage range from wrong high group(3.0V~3.3V) to low group
+(1.6V~1.9V) because the ldo1 should be 1.8V. Actually, two voltage groups
+have been supported at bd718x7-regulator driver, hence, just correct the
+voltage range to 1.6V~3.3V. For ldo2@0.8V, correct voltage range too.
+Otherwise, ldo1 would be kept @3.0V and ldo2@0.9V which violate i.mx8mm
+datasheet as the below warning log in kernel:
+
+[    0.995524] LDO1: Bringing 1800000uV into 3000000-3000000uV
+[    0.999196] LDO2: Bringing 800000uV into 900000-900000uV
+
+Fixes: 78cc25fa265d ("arm64: dts: imx8mm-evk: Add BD71847 PMIC")
+Cc: stable@vger.kernel.org
+Signed-off-by: Robin Gong <yibin.gong@nxp.com>
+Reviewed-by: Dong Aisheng <aisheng.dong@nxp.com>
+Reviewed-by: Fabio Estevam <festevam@gmail.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/boot/dts/freescale/imx8mm-evk.dts |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/boot/dts/freescale/imx8mm-evk.dts
++++ b/arch/arm64/boot/dts/freescale/imx8mm-evk.dts
+@@ -231,7 +231,7 @@
+ 
+                       ldo1_reg: LDO1 {
+                               regulator-name = "LDO1";
+-                              regulator-min-microvolt = <3000000>;
++                              regulator-min-microvolt = <1600000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+@@ -239,7 +239,7 @@
+ 
+                       ldo2_reg: LDO2 {
+                               regulator-name = "LDO2";
+-                              regulator-min-microvolt = <900000>;
++                              regulator-min-microvolt = <800000>;
+                               regulator-max-microvolt = <900000>;
+                               regulator-boot-on;
+                               regulator-always-on;
diff --git a/queue-5.4/arm64-dts-imx8mn-ddr4-evk-correct-ldo1-ldo2-voltage-range.patch b/queue-5.4/arm64-dts-imx8mn-ddr4-evk-correct-ldo1-ldo2-voltage-range.patch

new file mode 100644 (file)

index 0000000..fd124a3
--- /dev/null
+++ b/queue-5.4/arm64-dts-imx8mn-ddr4-evk-correct-ldo1-ldo2-voltage-range.patch
@@ -0,0 +1,51 @@
+From cfb12c8952f617df58d73d24161e539a035d82b0 Mon Sep 17 00:00:00 2001
+From: Robin Gong <yibin.gong@nxp.com>
+Date: Fri, 22 May 2020 18:44:51 +0800
+Subject: arm64: dts: imx8mn-ddr4-evk: correct ldo1/ldo2 voltage range
+
+From: Robin Gong <yibin.gong@nxp.com>
+
+commit cfb12c8952f617df58d73d24161e539a035d82b0 upstream.
+
+Correct ldo1 voltage range from wrong high group(3.0V~3.3V) to low group
+(1.6V~1.9V) because the ldo1 should be 1.8V. Actually, two voltage groups
+have been supported at bd718x7-regulator driver, hence, just correct the
+voltage range to 1.6V~3.3V. For ldo2@0.8V, correct voltage range too.
+Otherwise, ldo1 would be kept @3.0V and ldo2@0.9V which violate i.mx8mn
+datasheet as the below warning log in kernel:
+
+[    0.995524] LDO1: Bringing 1800000uV into 3000000-3000000uV
+[    0.999196] LDO2: Bringing 800000uV into 900000-900000uV
+
+Fixes: 3e44dd09736d ("arm64: dts: imx8mn-ddr4-evk: Add rohm,bd71847 PMIC support")
+Cc: stable@vger.kernel.org
+Signed-off-by: Robin Gong <yibin.gong@nxp.com>
+Reviewed-by: Dong Aisheng <aisheng.dong@nxp.com>
+Reviewed-by: Fabio Estevam <festevam@gmail.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/boot/dts/freescale/imx8mn-ddr4-evk.dts |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/boot/dts/freescale/imx8mn-ddr4-evk.dts
++++ b/arch/arm64/boot/dts/freescale/imx8mn-ddr4-evk.dts
+@@ -268,7 +268,7 @@
+ 
+                       ldo1_reg: LDO1 {
+                               regulator-name = "LDO1";
+-                              regulator-min-microvolt = <3000000>;
++                              regulator-min-microvolt = <1600000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+@@ -276,7 +276,7 @@
+ 
+                       ldo2_reg: LDO2 {
+                               regulator-name = "LDO2";
+-                              regulator-min-microvolt = <900000>;
++                              regulator-min-microvolt = <800000>;
+                               regulator-max-microvolt = <900000>;
+                               regulator-boot-on;
+                               regulator-always-on;
diff --git a/queue-5.4/arm64-perf-report-the-pc-value-in-regs_abi_32-mode.patch b/queue-5.4/arm64-perf-report-the-pc-value-in-regs_abi_32-mode.patch

new file mode 100644 (file)

index 0000000..4faf00e
--- /dev/null
+++ b/queue-5.4/arm64-perf-report-the-pc-value-in-regs_abi_32-mode.patch
@@ -0,0 +1,67 @@
+From 8dfe804a4031ca6ba3a3efb2048534249b64f3a5 Mon Sep 17 00:00:00 2001
+From: Jiping Ma <jiping.ma2@windriver.com>
+Date: Mon, 11 May 2020 10:52:07 +0800
+Subject: arm64: perf: Report the PC value in REGS_ABI_32 mode
+
+From: Jiping Ma <jiping.ma2@windriver.com>
+
+commit 8dfe804a4031ca6ba3a3efb2048534249b64f3a5 upstream.
+
+A 32-bit perf querying the registers of a compat task using REGS_ABI_32
+will receive zeroes from w15, when it expects to find the PC.
+
+Return the PC value for dwarf register 15 when returning register
+values for a compat task to perf.
+
+Cc: <stable@vger.kernel.org>
+Acked-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
+Link: https://lore.kernel.org/r/1589165527-188401-1-git-send-email-jiping.ma2@windriver.com
+[will: Shuffled code and added a comment]
+Signed-off-by: Will Deacon <will@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/kernel/perf_regs.c |   25 ++++++++++++++++++++++---
+ 1 file changed, 22 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/kernel/perf_regs.c
++++ b/arch/arm64/kernel/perf_regs.c
+@@ -15,15 +15,34 @@ u64 perf_reg_value(struct pt_regs *regs,
+               return 0;
+ 
+       /*
+-       * Compat (i.e. 32 bit) mode:
+-       * - PC has been set in the pt_regs struct in kernel_entry,
+-       * - Handle SP and LR here.
++       * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but
++       * we're stuck with it for ABI compatability reasons.
++       *
++       * For a 32-bit consumer inspecting a 32-bit task, then it will look at
++       * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h).
++       * These correspond directly to a prefix of the registers saved in our
++       * 'struct pt_regs', with the exception of the PC, so we copy that down
++       * (x15 corresponds to SP_hyp in the architecture).
++       *
++       * So far, so good.
++       *
++       * The oddity arises when a 64-bit consumer looks at a 32-bit task and
++       * asks for registers beyond PERF_REG_ARM_MAX. In this case, we return
++       * SP_usr, LR_usr and PC in the positions where the AArch64 SP, LR and
++       * PC registers would normally live. The initial idea was to allow a
++       * 64-bit unwinder to unwind a 32-bit task and, although it's not clear
++       * how well that works in practice, somebody might be relying on it.
++       *
++       * At the time we make a sample, we don't know whether the consumer is
++       * 32-bit or 64-bit, so we have to cater for both possibilities.
+        */
+       if (compat_user_mode(regs)) {
+               if ((u32)idx == PERF_REG_ARM64_SP)
+                       return regs->compat_sp;
+               if ((u32)idx == PERF_REG_ARM64_LR)
+                       return regs->compat_lr;
++              if (idx == 15)
++                      return regs->pc;
+       }
+ 
+       if ((u32)idx == PERF_REG_ARM64_SP)
diff --git a/queue-5.4/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch b/queue-5.4/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch

new file mode 100644 (file)

index 0000000..e7fd706
--- /dev/null
+++ b/queue-5.4/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch
@@ -0,0 +1,148 @@
+From e7a79811d0db136dc2d336b56d54cf1b774ce972 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 15 Jun 2020 10:38:44 +0100
+Subject: btrfs: check if a log root exists before locking the log_mutex on unlink
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit e7a79811d0db136dc2d336b56d54cf1b774ce972 upstream.
+
+This brings back an optimization that commit e678934cbe5f02 ("btrfs:
+Remove unnecessary check from join_running_log_trans") removed, but in
+a different form. So it's almost equivalent to a revert.
+
+That commit removed an optimization where we avoid locking a root's
+log_mutex when there is no log tree created in the current transaction.
+The affected code path is triggered through unlink operations.
+
+That commit was based on the assumption that the optimization was not
+necessary because we used to have the following checks when the patch
+was authored:
+
+  int btrfs_del_dir_entries_in_log(...)
+  {
+        (...)
+        if (dir->logged_trans < trans->transid)
+            return 0;
+
+        ret = join_running_log_trans(root);
+        (...)
+   }
+
+   int btrfs_del_inode_ref_in_log(...)
+   {
+        (...)
+        if (inode->logged_trans < trans->transid)
+            return 0;
+
+        ret = join_running_log_trans(root);
+        (...)
+   }
+
+However before that patch was merged, another patch was merged first which
+replaced those checks because they were buggy.
+
+That other patch corresponds to commit 803f0f64d17769 ("Btrfs: fix fsync
+not persisting dentry deletions due to inode evictions"). The assumption
+that if the logged_trans field of an inode had a smaller value than the
+current transaction's generation (transid) meant that the inode was not
+logged in the current transaction was only correct if the inode was not
+evicted and reloaded in the current transaction. So the corresponding bug
+fix changed those checks and replaced them with the following helper
+function:
+
+  static bool inode_logged(struct btrfs_trans_handle *trans,
+                           struct btrfs_inode *inode)
+  {
+        if (inode->logged_trans == trans->transid)
+                return true;
+
+        if (inode->last_trans == trans->transid &&
+            test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+            !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+                return true;
+
+        return false;
+  }
+
+So if we have a subvolume without a log tree in the current transaction
+(because we had no fsyncs), every time we unlink an inode we can end up
+trying to lock the log_mutex of the root through join_running_log_trans()
+twice, once for the inode being unlinked (by btrfs_del_inode_ref_in_log())
+and once for the parent directory (with btrfs_del_dir_entries_in_log()).
+
+This means if we have several unlink operations happening in parallel for
+inodes in the same subvolume, and those inodes and/or their parent
+inode were changed in the current transaction, we end up having a lot of
+contention on the log_mutex.
+
+The test robots from intel reported a -30.7% performance regression for
+a REAIM test after commit e678934cbe5f02 ("btrfs: Remove unnecessary check
+from join_running_log_trans").
+
+So just bring back the optimization to join_running_log_trans() where we
+check first if a log root exists before trying to lock the log_mutex. This
+is done by checking for a bit that is set on the root when a log tree is
+created and removed when a log tree is freed (at transaction commit time).
+
+Commit e678934cbe5f02 ("btrfs: Remove unnecessary check from
+join_running_log_trans") was merged in the 5.4 merge window while commit
+803f0f64d17769 ("Btrfs: fix fsync not persisting dentry deletions due to
+inode evictions") was merged in the 5.3 merge window. But the first
+commit was actually authored before the second commit (May 23 2019 vs
+June 19 2019).
+
+Reported-by: kernel test robot <rong.a.chen@intel.com>
+Link: https://lore.kernel.org/lkml/20200611090233.GL12456@shao2-debian/
+Fixes: e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h    |    2 ++
+ fs/btrfs/tree-log.c |    5 +++++
+ 2 files changed, 7 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -940,6 +940,8 @@ enum {
+       BTRFS_ROOT_DEAD_RELOC_TREE,
+       /* Mark dead root stored on device whose cleanup needs to be resumed */
+       BTRFS_ROOT_DEAD_TREE,
++      /* The root has a log tree. Used only for subvolume roots. */
++      BTRFS_ROOT_HAS_LOG_TREE,
+ };
+ 
+ /*
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -167,6 +167,7 @@ static int start_log_trans(struct btrfs_
+               if (ret)
+                       goto out;
+ 
++              set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
+               clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+               root->log_start_pid = current->pid;
+       }
+@@ -193,6 +194,9 @@ static int join_running_log_trans(struct
+ {
+       int ret = -ENOENT;
+ 
++      if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
++              return ret;
++
+       mutex_lock(&root->log_mutex);
+       if (root->log_root) {
+               ret = 0;
+@@ -3327,6 +3331,7 @@ int btrfs_free_log(struct btrfs_trans_ha
+       if (root->log_root) {
+               free_log_tree(trans, root->log_root);
+               root->log_root = NULL;
++              clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
+       }
+       return 0;
+ }
diff --git a/queue-5.4/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch b/queue-5.4/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch

new file mode 100644 (file)

index 0000000..5afe6ca
--- /dev/null
+++ b/queue-5.4/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch
@@ -0,0 +1,151 @@
+From 6bd335b469f945f75474c11e3f577f85409f39c3 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 8 Jun 2020 13:33:05 +0100
+Subject: btrfs: fix bytes_may_use underflow when running balance and scrub in parallel
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 6bd335b469f945f75474c11e3f577f85409f39c3 upstream.
+
+When balance and scrub are running in parallel it is possible to end up
+with an underflow of the bytes_may_use counter of the data space_info
+object, which triggers a warning like the following:
+
+   [134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
+   [134243.806891] ------------[ cut here ]------------
+   [134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
+   [134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
+   [134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G        W         5.6.0-rc7-btrfs-next-58 #5
+   [134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
+   [134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
+   [134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
+   [134243.819963] Code: 0b f2 85 (...)
+   [134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
+   [134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
+   [134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
+   [134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
+   [134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
+   [134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
+   [134243.827432] FS:  0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
+   [134243.828451] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+   [134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
+   [134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+   [134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+   [134243.831867] Call Trace:
+   [134243.832211]  find_free_extent+0x4a0/0x16c0 [btrfs]
+   [134243.832846]  btrfs_reserve_extent+0x91/0x180 [btrfs]
+   [134243.833487]  cow_file_range+0x12d/0x490 [btrfs]
+   [134243.834080]  fallback_to_cow+0x82/0x1b0 [btrfs]
+   [134243.834689]  ? release_extent_buffer+0x121/0x170 [btrfs]
+   [134243.835370]  run_delalloc_nocow+0x33f/0xa30 [btrfs]
+   [134243.836032]  btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
+   [134243.836725]  ? find_lock_delalloc_range+0x221/0x250 [btrfs]
+   [134243.837450]  writepage_delalloc+0xe8/0x150 [btrfs]
+   [134243.838059]  __extent_writepage+0xe8/0x4c0 [btrfs]
+   [134243.838674]  extent_write_cache_pages+0x237/0x530 [btrfs]
+   [134243.839364]  extent_writepages+0x44/0xa0 [btrfs]
+   [134243.839946]  do_writepages+0x23/0x80
+   [134243.840401]  __writeback_single_inode+0x59/0x700
+   [134243.841006]  writeback_sb_inodes+0x267/0x5f0
+   [134243.841548]  __writeback_inodes_wb+0x87/0xe0
+   [134243.842091]  wb_writeback+0x382/0x590
+   [134243.842574]  ? wb_workfn+0x4a2/0x6c0
+   [134243.843030]  wb_workfn+0x4a2/0x6c0
+   [134243.843468]  process_one_work+0x26d/0x6a0
+   [134243.843978]  worker_thread+0x4f/0x3e0
+   [134243.844452]  ? process_one_work+0x6a0/0x6a0
+   [134243.844981]  kthread+0x103/0x140
+   [134243.845400]  ? kthread_create_worker_on_cpu+0x70/0x70
+   [134243.846030]  ret_from_fork+0x3a/0x50
+   [134243.846494] irq event stamp: 0
+   [134243.846892] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
+   [134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
+   [134243.848687] softirqs last  enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
+   [134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
+   [134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
+   [134243.851335] ------------[ cut here ]------------
+
+When relocating a data block group, for each extent allocated in the
+block group we preallocate another extent with the same size for the
+data relocation inode (we do it at prealloc_file_extent_cluster()).
+We reserve space by calling btrfs_check_data_free_space(), which ends
+up incrementing the data space_info's bytes_may_use counter, and
+then call btrfs_prealloc_file_range() to allocate the extent, which
+always decrements the bytes_may_use counter by the same amount.
+
+The expectation is that writeback of the data relocation inode always
+follows a NOCOW path, by writing into the preallocated extents. However,
+when starting writeback we might end up falling back into the COW path,
+because the block group that contains the preallocated extent was turned
+into RO mode by a scrub running in parallel. The COW path then calls the
+extent allocator which ends up calling btrfs_add_reserved_bytes(), and
+this function decrements the bytes_may_use counter of the data space_info
+object by an amount corresponding to the size of the allocated extent,
+even though we haven't previously incremented it. When the counter currently
+has a value smaller than the allocated extent we reset the counter to 0
+and emit a warning, otherwise we just decrement it and slowly mess up
+with this counter which is crucial for space reservation, the end result
+can be granting reserved space to tasks when there isn't really enough
+free space, and having the tasks fail later in critical places where
+error handling consists of a transaction abort or hitting a BUG_ON().
+
+Fix this by making sure that if we fallback to the COW path for a data
+relocation inode, we increment the bytes_may_use counter of the data
+space_info object. The COW path will then decrement it at
+btrfs_add_reserved_bytes() on success or through its error handling part
+by a call to extent_clear_unlock_delalloc() (which ends up calling
+btrfs_clear_delalloc_extent() that does the decrement operation) in case
+of an error.
+
+Test case btrfs/061 from fstests could sporadically trigger this.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1328,6 +1328,8 @@ static int fallback_to_cow(struct inode
+                          int *page_started, unsigned long *nr_written)
+ {
+       const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
++      const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
++                                 BTRFS_DATA_RELOC_TREE_OBJECTID);
+       const u64 range_bytes = end + 1 - start;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       u64 range_start = start;
+@@ -1358,18 +1360,23 @@ static int fallback_to_cow(struct inode
+        *    data space info, which we incremented in the step above.
+        *
+        * If we need to fallback to cow and the inode corresponds to a free
+-       * space cache inode, we must also increment bytes_may_use of the data
+-       * space_info for the same reason. Space caches always get a prealloc
++       * space cache inode or an inode of the data relocation tree, we must
++       * also increment bytes_may_use of the data space_info for the same
++       * reason. Space caches and relocated data extents always get a prealloc
+        * extent for them, however scrub or balance may have set the block
+-       * group that contains that extent to RO mode.
++       * group that contains that extent to RO mode and therefore force COW
++       * when starting writeback.
+        */
+       count = count_range_bits(io_tree, &range_start, end, range_bytes,
+                                EXTENT_NORESERVE, 0);
+-      if (count > 0 || is_space_ino) {
+-              const u64 bytes = is_space_ino ? range_bytes : count;
++      if (count > 0 || is_space_ino || is_reloc_ino) {
++              u64 bytes = count;
+               struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+               struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ 
++              if (is_space_ino || is_reloc_ino)
++                      bytes = range_bytes;
++
+               spin_lock(&sinfo->lock);
+               btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+               spin_unlock(&sinfo->lock);
diff --git a/queue-5.4/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch b/queue-5.4/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch

new file mode 100644 (file)

index 0000000..9b20874
--- /dev/null
+++ b/queue-5.4/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch
@@ -0,0 +1,194 @@
+From 432cd2a10f1c10cead91fe706ff5dc52f06d642a Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 8 Jun 2020 13:32:55 +0100
+Subject: btrfs: fix data block group relocation failure due to concurrent scrub
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 432cd2a10f1c10cead91fe706ff5dc52f06d642a upstream.
+
+When running relocation of a data block group while scrub is running in
+parallel, it is possible that the relocation will fail and abort the
+current transaction with an -EINVAL error:
+
+   [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
+   [134243.999871] ------------[ cut here ]------------
+   [134244.000741] BTRFS: Transaction aborted (error -22)
+   [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
+   [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
+   [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G        W         5.6.0-rc7-btrfs-next-58 #5
+   [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
+   [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
+   [134244.017151] Code: 48 c7 c7 (...)
+   [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
+   [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
+   [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
+   [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
+   [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
+   [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
+   [134244.028024] FS:  00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
+   [134244.029491] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+   [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
+   [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+   [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+   [134244.034484] Call Trace:
+   [134244.034984]  btrfs_cow_block+0x12b/0x2b0 [btrfs]
+   [134244.035859]  do_relocation+0x30b/0x790 [btrfs]
+   [134244.036681]  ? do_raw_spin_unlock+0x49/0xc0
+   [134244.037460]  ? _raw_spin_unlock+0x29/0x40
+   [134244.038235]  relocate_tree_blocks+0x37b/0x730 [btrfs]
+   [134244.039245]  relocate_block_group+0x388/0x770 [btrfs]
+   [134244.040228]  btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
+   [134244.041323]  btrfs_relocate_chunk+0x36/0x110 [btrfs]
+   [134244.041345]  btrfs_balance+0xc06/0x1860 [btrfs]
+   [134244.043382]  ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
+   [134244.045586]  btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
+   [134244.045611]  btrfs_ioctl+0x1880/0x3760 [btrfs]
+   [134244.049043]  ? do_raw_spin_unlock+0x49/0xc0
+   [134244.049838]  ? _raw_spin_unlock+0x29/0x40
+   [134244.050587]  ? __handle_mm_fault+0x11b3/0x14b0
+   [134244.051417]  ? ksys_ioctl+0x92/0xb0
+   [134244.052070]  ksys_ioctl+0x92/0xb0
+   [134244.052701]  ? trace_hardirqs_off_thunk+0x1a/0x1c
+   [134244.053511]  __x64_sys_ioctl+0x16/0x20
+   [134244.054206]  do_syscall_64+0x5c/0x280
+   [134244.054891]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+   [134244.055819] RIP: 0033:0x7f29b51c9dd7
+   [134244.056491] Code: 00 00 00 (...)
+   [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
+   [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
+   [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
+   [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
+   [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
+   [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
+   [134244.067626] irq event stamp: 0
+   [134244.068202] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
+   [134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
+   [134244.070909] softirqs last  enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
+   [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
+   [134244.073432] ---[ end trace bd7c03622e0b0a99 ]---
+
+The -EINVAL error comes from the following chain of function calls:
+
+  __btrfs_cow_block() <-- aborts the transaction
+    btrfs_reloc_cow_block()
+      replace_file_extents()
+        get_new_location() <-- returns -EINVAL
+
+When relocating a data block group, for each allocated extent of the block
+group, we preallocate another extent (at prealloc_file_extent_cluster()),
+associated with the data relocation inode, and then dirty all its pages.
+These preallocated extents have, and must have, the same size that extents
+from the data block group being relocated have.
+
+Later before we start the relocation stage that updates pointers (bytenr
+field of file extent items) to point to the new extents, we trigger
+writeback for the data relocation inode. The expectation is that writeback
+will write the pages to the previously preallocated extents, that it
+follows the NOCOW path. That is generally the case, however, if a scrub
+is running it may have turned the block group that contains those extents
+into RO mode, in which case writeback falls back to the COW path.
+
+However in the COW path instead of allocating exactly one extent with the
+expected size, the allocator may end up allocating several smaller extents
+due to free space fragmentation - because we tell it at cow_file_range()
+that the minimum allocation size can match the filesystem's sector size.
+This later breaks the relocation's expectation that an extent associated
+to a file extent item in the data relocation inode has the same size as
+the respective extent pointed by a file extent item in another tree - in
+this case the extent to which the relocation inode points to is smaller,
+causing relocation.c:get_new_location() to return -EINVAL.
+
+For example, if we are relocating a data block group X that has a logical
+address of X and the block group has an extent allocated at the logical
+address X + 128KiB with a size of 64KiB:
+
+1) At prealloc_file_extent_cluster() we allocate an extent for the data
+   relocation inode with a size of 64KiB and associate it to the file
+   offset 128KiB (X + 128KiB - X) of the data relocation inode. This
+   preallocated extent was allocated at block group Z;
+
+2) A scrub running in parallel turns block group Z into RO mode and
+   starts scrubbing its extents;
+
+3) Relocation triggers writeback for the data relocation inode;
+
+4) When running delalloc (btrfs_run_delalloc_range()), we try first the
+   NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
+   set in its flags. However, because block group Z is in RO mode, the
+   NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
+   calling cow_file_range();
+
+5) At cow_file_range(), in the first iteration of the while loop we call
+   btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
+   allocation size of 4KiB (fs_info->sectorsize). Due to free space
+   fragmentation, btrfs_reserve_extent() ends up allocating two extents
+   of 32KiB each, each one on a different iteration of that while loop;
+
+6) Writeback of the data relocation inode completes;
+
+7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
+   with a leaf which has a file extent item that points to the data extent
+   from block group X, that has a logical address (bytenr) of X + 128KiB
+   and a size of 64KiB. Then it calls get_new_location(), which does a
+   lookup in the data relocation tree for a file extent item starting at
+   offset 128KiB (X + 128KiB - X) and belonging to the data relocation
+   inode. It finds a corresponding file extent item, however that item
+   points to an extent that has a size of 32KiB, which doesn't match the
+   expected size of 64KiB, resulting in -EINVAL being returned from this
+   function and propagated up to __btrfs_cow_block(), which aborts the
+   current transaction.
+
+To fix this make sure that at cow_file_range() when we call the allocator
+we pass it a minimum allocation size corresponding to the desired extent size
+if the inode belongs to the data relocation tree, otherwise pass it the
+filesystem's sector size as the minimum allocation size.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   19 ++++++++++++++++++-
+ 1 file changed, 18 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -975,6 +975,7 @@ static noinline int cow_file_range(struc
+       u64 num_bytes;
+       unsigned long ram_size;
+       u64 cur_alloc_size = 0;
++      u64 min_alloc_size;
+       u64 blocksize = fs_info->sectorsize;
+       struct btrfs_key ins;
+       struct extent_map *em;
+@@ -1025,10 +1026,26 @@ static noinline int cow_file_range(struc
+       btrfs_drop_extent_cache(BTRFS_I(inode), start,
+                       start + num_bytes - 1, 0);
+ 
++      /*
++       * Relocation relies on the relocated extents to have exactly the same
++       * size as the original extents. Normally writeback for relocation data
++       * extents follows a NOCOW path because relocation preallocates the
++       * extents. However, due to an operation such as scrub turning a block
++       * group to RO mode, it may fallback to COW mode, so we must make sure
++       * an extent allocated during COW has exactly the requested size and can
++       * not be split into smaller extents, otherwise relocation breaks and
++       * fails during the stage where it updates the bytenr of file extent
++       * items.
++       */
++      if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
++              min_alloc_size = num_bytes;
++      else
++              min_alloc_size = fs_info->sectorsize;
++
+       while (num_bytes > 0) {
+               cur_alloc_size = num_bytes;
+               ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+-                                         fs_info->sectorsize, 0, alloc_hint,
++                                         min_alloc_size, 0, alloc_hint,
+                                          &ins, 1, 1);
+               if (ret < 0)
+                       goto out_unlock;
diff --git a/queue-5.4/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch b/queue-5.4/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch

new file mode 100644 (file)

index 0000000..44699a0
--- /dev/null
+++ b/queue-5.4/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch
@@ -0,0 +1,61 @@
+From 4b1946284dd6641afdb9457101056d9e6ee6204c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 15 Jun 2020 18:48:58 +0100
+Subject: btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond eof
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 4b1946284dd6641afdb9457101056d9e6ee6204c upstream.
+
+If we attempt to write to prealloc extent located after eof using a
+RWF_NOWAIT write, we always fail with -EAGAIN.
+
+We do actually check if we have an allocated extent for the write at
+the start of btrfs_file_write_iter() through a call to check_can_nocow(),
+but later when we go into the actual direct IO write path we simply
+return -EAGAIN if the write starts at or beyond EOF.
+
+Trivial to reproduce:
+
+  $ mkfs.btrfs -f /dev/sdb
+  $ mount /dev/sdb /mnt
+
+  $ touch /mnt/foo
+  $ chattr +C /mnt/foo
+
+  $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foo
+  wrote 65536/65536 bytes at offset 0
+  64 KiB, 16 ops; 0.0004 sec (135.575 MiB/sec and 34707.1584 ops/sec)
+
+  $ xfs_io -c "falloc -k 64K 1M" /mnt/foo
+
+  $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe -b 64K 64K 64K" /mnt/foo
+  pwrite: Resource temporarily unavailable
+
+On xfs and ext4 the write succeeds, as expected.
+
+Fix this by removing the wrong check at btrfs_direct_IO().
+
+Fixes: edf064e7c6fec3 ("btrfs: nowait aio support")
+CC: stable@vger.kernel.org # 4.14+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -8857,9 +8857,6 @@ static ssize_t btrfs_direct_IO(struct ki
+                       dio_data.overwrite = 1;
+                       inode_unlock(inode);
+                       relock = true;
+-              } else if (iocb->ki_flags & IOCB_NOWAIT) {
+-                      ret = -EAGAIN;
+-                      goto out;
+               }
+               ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+                                                  offset, count);
diff --git a/queue-5.4/mm-memcontrol.c-add-missed-css_put.patch b/queue-5.4/mm-memcontrol.c-add-missed-css_put.patch

new file mode 100644 (file)

index 0000000..1125b65
--- /dev/null
+++ b/queue-5.4/mm-memcontrol.c-add-missed-css_put.patch
@@ -0,0 +1,42 @@
+From 3a98990ae2150277ed34d3b248c60e68bf2244b2 Mon Sep 17 00:00:00 2001
+From: Muchun Song <songmuchun@bytedance.com>
+Date: Thu, 25 Jun 2020 20:30:19 -0700
+Subject: mm/memcontrol.c: add missed css_put()
+
+From: Muchun Song <songmuchun@bytedance.com>
+
+commit 3a98990ae2150277ed34d3b248c60e68bf2244b2 upstream.
+
+We should put the css reference when memory allocation failed.
+
+Link: http://lkml.kernel.org/r/20200614122653.98829-1-songmuchun@bytedance.com
+Fixes: f0a3a24b532d ("mm: memcg/slab: rework non-root kmem_cache lifecycle management")
+Signed-off-by: Muchun Song <songmuchun@bytedance.com>
+Acked-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Qian Cai <cai@lca.pw>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2895,8 +2895,10 @@ static void memcg_schedule_kmem_cache_cr
+               return;
+ 
+       cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
+-      if (!cw)
++      if (!cw) {
++              css_put(&memcg->css);
+               return;
++      }
+ 
+       cw->memcg = memcg;
+       cw->cachep = cachep;
diff --git a/queue-5.4/mm-slab-use-memzero_explicit-in-kzfree.patch b/queue-5.4/mm-slab-use-memzero_explicit-in-kzfree.patch

new file mode 100644 (file)

index 0000000..e604404
--- /dev/null
+++ b/queue-5.4/mm-slab-use-memzero_explicit-in-kzfree.patch
@@ -0,0 +1,54 @@
+From 8982ae527fbef170ef298650c15d55a9ccd33973 Mon Sep 17 00:00:00 2001
+From: Waiman Long <longman@redhat.com>
+Date: Thu, 25 Jun 2020 20:29:52 -0700
+Subject: mm/slab: use memzero_explicit() in kzfree()
+
+From: Waiman Long <longman@redhat.com>
+
+commit 8982ae527fbef170ef298650c15d55a9ccd33973 upstream.
+
+The kzfree() function is normally used to clear some sensitive
+information, like encryption keys, in the buffer before freeing it back to
+the pool.  Memset() is currently used for buffer clearing.  However
+unlikely, there is still a non-zero probability that the compiler may
+choose to optimize away the memory clearing especially if LTO is being
+used in the future.
+
+To make sure that this optimization will never happen,
+memzero_explicit(), which is introduced in v3.18, is now used in
+kzfree() to future-proof it.
+
+Link: http://lkml.kernel.org/r/20200616154311.12314-2-longman@redhat.com
+Fixes: 3ef0e5ba4673 ("slab: introduce kzfree()")
+Signed-off-by: Waiman Long <longman@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: David Howells <dhowells@redhat.com>
+Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+Cc: James Morris <jmorris@namei.org>
+Cc: "Serge E. Hallyn" <serge@hallyn.com>
+Cc: Joe Perches <joe@perches.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab_common.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -1740,7 +1740,7 @@ void kzfree(const void *p)
+       if (unlikely(ZERO_OR_NULL_PTR(mem)))
+               return;
+       ks = ksize(mem);
+-      memset(mem, 0, ks);
++      memzero_explicit(mem, ks);
+       kfree(mem);
+ }
+ EXPORT_SYMBOL(kzfree);
diff --git a/queue-5.4/ocfs2-avoid-inode-removal-while-nfsd-is-accessing-it.patch b/queue-5.4/ocfs2-avoid-inode-removal-while-nfsd-is-accessing-it.patch

new file mode 100644 (file)

index 0000000..be11bfb
--- /dev/null
+++ b/queue-5.4/ocfs2-avoid-inode-removal-while-nfsd-is-accessing-it.patch
@@ -0,0 +1,98 @@
+From 4cd9973f9ff69e37dd0ba2bd6e6423f8179c329a Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Thu, 25 Jun 2020 20:29:30 -0700
+Subject: ocfs2: avoid inode removal while nfsd is accessing it
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 4cd9973f9ff69e37dd0ba2bd6e6423f8179c329a upstream.
+
+Patch series "ocfs2: fix nfsd over ocfs2 issues", v2.
+
+This is a series of patches to fix issues on nfsd over ocfs2.  Patch 1
+avoids inode removal while nfsd is accessing it; patches 2 & 3 fix a
+panic issue.
+
+This patch (of 4):
+
+When nfsd is getting a file dentry using a handle or the parent dentry of
+some dentry, a cluster lock is used to keep the inode from being removed
+by another node, but it could still be removed by the local node, so use
+a rw lock to avoid this.
+
+Link: http://lkml.kernel.org/r/20200616183829.87211-1-junxiao.bi@oracle.com
+Link: http://lkml.kernel.org/r/20200616183829.87211-2-junxiao.bi@oracle.com
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/dlmglue.c |   17 ++++++++++++++++-
+ fs/ocfs2/ocfs2.h   |    1 +
+ 2 files changed, 17 insertions(+), 1 deletion(-)
+
+--- a/fs/ocfs2/dlmglue.c
++++ b/fs/ocfs2/dlmglue.c
+@@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init
+                                  &ocfs2_nfs_sync_lops, osb);
+ }
+ 
++static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
++{
++      ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
++      init_rwsem(&osb->nfs_sync_rwlock);
++}
++
+ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+ {
+       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+@@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_sup
+       if (ocfs2_is_hard_readonly(osb))
+               return -EROFS;
+ 
++      if (ex)
++              down_write(&osb->nfs_sync_rwlock);
++      else
++              down_read(&osb->nfs_sync_rwlock);
++
+       if (ocfs2_mount_local(osb))
+               return 0;
+ 
+@@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_
+       if (!ocfs2_mount_local(osb))
+               ocfs2_cluster_unlock(osb, lockres,
+                                    ex ? LKM_EXMODE : LKM_PRMODE);
++      if (ex)
++              up_write(&osb->nfs_sync_rwlock);
++      else
++              up_read(&osb->nfs_sync_rwlock);
+ }
+ 
+ int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+@@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *o
+ local:
+       ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+       ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+-      ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
++      ocfs2_nfs_sync_lock_init(osb);
+       ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
+ 
+       osb->cconn = conn;
+--- a/fs/ocfs2/ocfs2.h
++++ b/fs/ocfs2/ocfs2.h
+@@ -394,6 +394,7 @@ struct ocfs2_super
+       struct ocfs2_lock_res osb_super_lockres;
+       struct ocfs2_lock_res osb_rename_lockres;
+       struct ocfs2_lock_res osb_nfs_sync_lockres;
++      struct rw_semaphore nfs_sync_rwlock;
+       struct ocfs2_lock_res osb_trim_fs_lockres;
+       struct mutex obs_trim_fs_mutex;
+       struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/queue-5.4/ocfs2-fix-panic-on-nfs-server-over-ocfs2.patch b/queue-5.4/ocfs2-fix-panic-on-nfs-server-over-ocfs2.patch

new file mode 100644 (file)

index 0000000..5b420b3
--- /dev/null
+++ b/queue-5.4/ocfs2-fix-panic-on-nfs-server-over-ocfs2.patch
@@ -0,0 +1,90 @@
+From e5a15e17a78d58f933d17cafedfcf7486a29f5b4 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Thu, 25 Jun 2020 20:29:37 -0700
+Subject: ocfs2: fix panic on nfs server over ocfs2
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit e5a15e17a78d58f933d17cafedfcf7486a29f5b4 upstream.
+
+The following kernel panic was captured when running nfs server over
+ocfs2, at that time ocfs2_test_inode_bit() was checking whether one
+inode located at "blkno" 5 was valid, that is ocfs2 root inode, its
+"suballoc_slot" was OCFS2_INVALID_SLOT(65535) and it was allocated from
+//global_inode_alloc, but here it wrongly assumed that it was got from per
+slot inode allocator which would cause array overflow and trigger kernel
+panic.
+
+  BUG: unable to handle kernel paging request at 0000000000001088
+  IP: [<ffffffff816f6898>] _raw_spin_lock+0x18/0xf0
+  PGD 1e06ba067 PUD 1e9e7d067 PMD 0
+  Oops: 0002 [#1] SMP
+  CPU: 6 PID: 24873 Comm: nfsd Not tainted 4.1.12-124.36.1.el6uek.x86_64 #2
+  Hardware name: Huawei CH121 V3/IT11SGCA1, BIOS 3.87 02/02/2018
+  RIP: _raw_spin_lock+0x18/0xf0
+  RSP: e02b:ffff88005ae97908  EFLAGS: 00010206
+  RAX: ffff88005ae98000 RBX: 0000000000001088 RCX: 0000000000000000
+  RDX: 0000000000020000 RSI: 0000000000000009 RDI: 0000000000001088
+  RBP: ffff88005ae97928 R08: 0000000000000000 R09: ffff880212878e00
+  R10: 0000000000007ff0 R11: 0000000000000000 R12: 0000000000001088
+  R13: ffff8800063c0aa8 R14: ffff8800650c27d0 R15: 000000000000ffff
+  FS:  0000000000000000(0000) GS:ffff880218180000(0000) knlGS:ffff880218180000
+  CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 0000000000001088 CR3: 00000002033d0000 CR4: 0000000000042660
+  Call Trace:
+    igrab+0x1e/0x60
+    ocfs2_get_system_file_inode+0x63/0x3a0 [ocfs2]
+    ocfs2_test_inode_bit+0x328/0xa00 [ocfs2]
+    ocfs2_get_parent+0xba/0x3e0 [ocfs2]
+    reconnect_path+0xb5/0x300
+    exportfs_decode_fh+0xf6/0x2b0
+    fh_verify+0x350/0x660 [nfsd]
+    nfsd4_putfh+0x4d/0x60 [nfsd]
+    nfsd4_proc_compound+0x3d3/0x6f0 [nfsd]
+    nfsd_dispatch+0xe0/0x290 [nfsd]
+    svc_process_common+0x412/0x6a0 [sunrpc]
+    svc_process+0x123/0x210 [sunrpc]
+    nfsd+0xff/0x170 [nfsd]
+    kthread+0xcb/0xf0
+    ret_from_fork+0x61/0x90
+  Code: 83 c2 02 0f b7 f2 e8 18 dc 91 ff 66 90 eb bf 0f 1f 40 00 55 48 89 e5 41 56 41 55 41 54 53 0f 1f 44 00 00 48 89 fb ba 00 00 02 00 <f0> 0f c1 17 89 d0 45 31 e4 45 31 ed c1 e8 10 66 39 d0 41 89 c6
+  RIP   _raw_spin_lock+0x18/0xf0
+  CR2: 0000000000001088
+  ---[ end trace 7264463cd1aac8f9 ]---
+  Kernel panic - not syncing: Fatal exception
+
+Link: http://lkml.kernel.org/r/20200616183829.87211-4-junxiao.bi@oracle.com
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/suballoc.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/ocfs2/suballoc.c
++++ b/fs/ocfs2/suballoc.c
+@@ -2827,9 +2827,12 @@ int ocfs2_test_inode_bit(struct ocfs2_su
+               goto bail;
+       }
+ 
+-      inode_alloc_inode =
+-              ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+-                                          suballoc_slot);
++      if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
++              inode_alloc_inode = ocfs2_get_system_file_inode(osb,
++                      GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
++      else
++              inode_alloc_inode = ocfs2_get_system_file_inode(osb,
++                      INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+       if (!inode_alloc_inode) {
+               /* the error code could be inaccurate, but we are not able to
+                * get the correct one. */
diff --git a/queue-5.4/ocfs2-fix-value-of-ocfs2_invalid_slot.patch b/queue-5.4/ocfs2-fix-value-of-ocfs2_invalid_slot.patch

new file mode 100644 (file)

index 0000000..475b56c
--- /dev/null
+++ b/queue-5.4/ocfs2-fix-value-of-ocfs2_invalid_slot.patch
@@ -0,0 +1,53 @@
+From 9277f8334ffc719fe922d776444d6e4e884dbf30 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Thu, 25 Jun 2020 20:29:40 -0700
+Subject: ocfs2: fix value of OCFS2_INVALID_SLOT
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 9277f8334ffc719fe922d776444d6e4e884dbf30 upstream.
+
+In the ocfs2 disk layout, slot number is 16 bits, but in ocfs2
+implementation, slot number is 32 bits.  Usually this will not cause any
+issue, because slot number is converted from u16 to u32, but
+OCFS2_INVALID_SLOT was defined as -1, when an invalid slot number from
+disk was obtained, its value was (u16)-1, and it was converted to u32.
+Then the following checking in get_local_system_inode will be always
+skipped:
+
+ static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+                                               int type,
+                                               u32 slot)
+ {
+       BUG_ON(slot == OCFS2_INVALID_SLOT);
+       ...
+ }
+
+Link: http://lkml.kernel.org/r/20200616183829.87211-5-junxiao.bi@oracle.com
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/ocfs2_fs.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ocfs2/ocfs2_fs.h
++++ b/fs/ocfs2/ocfs2_fs.h
+@@ -290,7 +290,7 @@
+ #define OCFS2_MAX_SLOTS                       255
+ 
+ /* Slot map indicator for an empty slot */
+-#define OCFS2_INVALID_SLOT            -1
++#define OCFS2_INVALID_SLOT            ((u16)-1)
+ 
+ #define OCFS2_VOL_UUID_LEN            16
+ #define OCFS2_MAX_VOL_LABEL_LEN               64
diff --git a/queue-5.4/ocfs2-load-global_inode_alloc.patch b/queue-5.4/ocfs2-load-global_inode_alloc.patch

new file mode 100644 (file)

index 0000000..c8a4a1b
--- /dev/null
+++ b/queue-5.4/ocfs2-load-global_inode_alloc.patch
@@ -0,0 +1,43 @@
+From 7569d3c754e452769a5747eeeba488179e38a5da Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Thu, 25 Jun 2020 20:29:33 -0700
+Subject: ocfs2: load global_inode_alloc
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 7569d3c754e452769a5747eeeba488179e38a5da upstream.
+
+Set global_inode_alloc as OCFS2_FIRST_ONLINE_SYSTEM_INODE, that will
+make it load during mount.  It can be used to test whether some
+global/system inodes are valid.  One use case is that nfsd will test
+whether root inode is valid.
+
+Link: http://lkml.kernel.org/r/20200616183829.87211-3-junxiao.bi@oracle.com
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/ocfs2_fs.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ocfs2/ocfs2_fs.h
++++ b/fs/ocfs2/ocfs2_fs.h
+@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
+ enum {
+       BAD_BLOCK_SYSTEM_INODE = 0,
+       GLOBAL_INODE_ALLOC_SYSTEM_INODE,
++#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
+       SLOT_MAP_SYSTEM_INODE,
+-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+       HEARTBEAT_SYSTEM_INODE,
+       GLOBAL_BITMAP_SYSTEM_INODE,
+       USER_QUOTA_SYSTEM_INODE,
diff --git a/queue-5.4/series b/queue-5.4/series

index 2f0d019de21e16fb168837c256ca725240fa1bf5..53c18340a7167689ddebe608eccf102f0d90b0e0 100644 (file)
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -147,3 +147,16 @@ kvm-nvmx-plumb-l2-gpa-through-to-pml-emulation.patch
  kvm-vmx-stop-context-switching-msr_ia32_umwait_control.patch
  x86-cpu-use-pinning-mask-for-cr4-bits-needing-to-be-0.patch
  x86-asm-64-align-start-of-__clear_user-loop-to-16-bytes.patch
+btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch
+btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch
+btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch
+btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch
+mm-slab-use-memzero_explicit-in-kzfree.patch
+ocfs2-avoid-inode-removal-while-nfsd-is-accessing-it.patch
+ocfs2-load-global_inode_alloc.patch
+ocfs2-fix-value-of-ocfs2_invalid_slot.patch
+ocfs2-fix-panic-on-nfs-server-over-ocfs2.patch
+mm-memcontrol.c-add-missed-css_put.patch
+arm64-perf-report-the-pc-value-in-regs_abi_32-mode.patch
+arm64-dts-imx8mm-evk-correct-ldo1-ldo2-voltage-range.patch
+arm64-dts-imx8mn-ddr4-evk-correct-ldo1-ldo2-voltage-range.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 29 Jun 2020 11:26:51 +0000 (13:26 +0200)
queue-5.4/arm64-dts-imx8mm-evk-correct-ldo1-ldo2-voltage-range.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/arm64-dts-imx8mn-ddr4-evk-correct-ldo1-ldo2-voltage-range.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/arm64-perf-report-the-pc-value-in-regs_abi_32-mode.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/mm-memcontrol.c-add-missed-css_put.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/mm-slab-use-memzero_explicit-in-kzfree.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/ocfs2-avoid-inode-removal-while-nfsd-is-accessing-it.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/ocfs2-fix-panic-on-nfs-server-over-ocfs2.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/ocfs2-fix-value-of-ocfs2_invalid_slot.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/ocfs2-load-global_inode_alloc.patch	[new file with mode: 0644]	patch \| blob
queue-5.4/series		patch \| blob \| blame \| history