git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.16
author Sasha Levin <sashal@kernel.org>
Thu, 3 Mar 2022 21:33:04 +0000 (16:33 -0500)
committer Sasha Levin <sashal@kernel.org>
Thu, 3 Mar 2022 21:33:04 +0000 (16:33 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
17 files changed:
queue-5.16/arm64-mark-start_backtrace-notrace-and-nokprobe_symb.patch [new file with mode: 0644]
queue-5.16/ata-pata_hpt37x-fix-pci-clock-detection.patch [new file with mode: 0644]
queue-5.16/drm-amdgpu-check-vm-ready-by-amdgpu_vm-evicting-flag.patch [new file with mode: 0644]
queue-5.16/exfat-fix-i_blocks-for-files-truncated-over-4-gib.patch [new file with mode: 0644]
queue-5.16/exfat-reuse-exfat_inode_info-variable-instead-of-cal.patch [new file with mode: 0644]
queue-5.16/ext4-drop-ineligible-txn-start-stop-apis.patch [new file with mode: 0644]
queue-5.16/ext4-fast-commit-may-miss-file-actions.patch [new file with mode: 0644]
queue-5.16/ext4-fast-commit-may-not-fallback-for-ineligible-com.patch [new file with mode: 0644]
queue-5.16/ext4-simplify-updating-of-fast-commit-stats.patch [new file with mode: 0644]
queue-5.16/ipv6-fix-skb-drops-in-igmp6_event_query-and-igmp6_ev.patch [new file with mode: 0644]
queue-5.16/kvm-arm64-workaround-cortex-a510-s-single-step-and-p.patch [new file with mode: 0644]
queue-5.16/kvm-x86-add-kvm_cap_enable_cap-to-x86.patch [new file with mode: 0644]
queue-5.16/sched-fair-fix-fault-in-reweight_entity.patch [new file with mode: 0644]
queue-5.16/serial-stm32-prevent-tdr-register-overwrite-when-sen.patch [new file with mode: 0644]
queue-5.16/series
queue-5.16/tracing-add-test-for-user-space-strings-when-filteri.patch [new file with mode: 0644]
queue-5.16/tracing-add-ustring-operation-to-filtering-string-po.patch [new file with mode: 0644]

diff --git a/queue-5.16/arm64-mark-start_backtrace-notrace-and-nokprobe_symb.patch b/queue-5.16/arm64-mark-start_backtrace-notrace-and-nokprobe_symb.patch
new file mode 100644 (file)
index 0000000..a1fc4ad
--- /dev/null
@@ -0,0 +1,49 @@
+From 9b93f78bc256080274ff351d54eb73fe892aadf6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 24 Jan 2022 17:17:54 +0900
+Subject: arm64: Mark start_backtrace() notrace and NOKPROBE_SYMBOL
+
+From: Masami Hiramatsu <mhiramat@kernel.org>
+
+[ Upstream commit 1e0924bd09916fab795fc2a21ec1d148f24299fd ]
+
+Mark start_backtrace() as notrace and NOKPROBE_SYMBOL
+because this function is called from ftrace and lockdep to
+get the caller address via return_address(). Since lockdep
+is used in kprobes, it should also be NOKPROBE_SYMBOL.
+
+Fixes: b07f3499661c ("arm64: stacktrace: Move start_backtrace() out of the header")
+Cc: <stable@vger.kernel.org> # 5.13.x
+Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
+Reviewed-by: Mark Brown <broonie@kernel.org>
+Link: https://lore.kernel.org/r/164301227374.1433152.12808232644267107415.stgit@devnote2
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/kernel/stacktrace.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
+index 94f83cd44e507..0ee6bd390bd09 100644
+--- a/arch/arm64/kernel/stacktrace.c
++++ b/arch/arm64/kernel/stacktrace.c
+@@ -33,7 +33,7 @@
+  */
+-void start_backtrace(struct stackframe *frame, unsigned long fp,
++notrace void start_backtrace(struct stackframe *frame, unsigned long fp,
+                    unsigned long pc)
+ {
+       frame->fp = fp;
+@@ -55,6 +55,7 @@ void start_backtrace(struct stackframe *frame, unsigned long fp,
+       frame->prev_fp = 0;
+       frame->prev_type = STACK_TYPE_UNKNOWN;
+ }
++NOKPROBE_SYMBOL(start_backtrace);
+ /*
+  * Unwind from one frame record (A) to the next frame record (B).
+-- 
+2.34.1
+
diff --git a/queue-5.16/ata-pata_hpt37x-fix-pci-clock-detection.patch b/queue-5.16/ata-pata_hpt37x-fix-pci-clock-detection.patch
new file mode 100644 (file)
index 0000000..275a435
--- /dev/null
@@ -0,0 +1,45 @@
+From da843dd5695cde66ea4bcce7142d469b2d8514d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 19 Feb 2022 23:04:29 +0300
+Subject: ata: pata_hpt37x: fix PCI clock detection
+
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+
+[ Upstream commit 5f6b0f2d037c8864f20ff15311c695f65eb09db5 ]
+
+The f_CNT register (at the PCI config. address 0x78) is 16-bit, not
+8-bit! The bug was there from the very start... :-(
+
+Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Fixes: 669a5db411d8 ("[libata] Add a bunch of PATA drivers.")
+Cc: stable@vger.kernel.org
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/ata/pata_hpt37x.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c
+index ae8375e9d2681..9d371859e81ed 100644
+--- a/drivers/ata/pata_hpt37x.c
++++ b/drivers/ata/pata_hpt37x.c
+@@ -964,14 +964,14 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+       if ((freq >> 12) != 0xABCDE) {
+               int i;
+-              u8 sr;
++              u16 sr;
+               u32 total = 0;
+               pr_warn("BIOS has not set timing clocks\n");
+               /* This is the process the HPT371 BIOS is reported to use */
+               for (i = 0; i < 128; i++) {
+-                      pci_read_config_byte(dev, 0x78, &sr);
++                      pci_read_config_word(dev, 0x78, &sr);
+                       total += sr & 0x1FF;
+                       udelay(15);
+               }
+-- 
+2.34.1
+
diff --git a/queue-5.16/drm-amdgpu-check-vm-ready-by-amdgpu_vm-evicting-flag.patch b/queue-5.16/drm-amdgpu-check-vm-ready-by-amdgpu_vm-evicting-flag.patch
new file mode 100644 (file)
index 0000000..69a22ec
--- /dev/null
@@ -0,0 +1,83 @@
+From ba0f28782dfb657a17208d37a05a7187e3335be4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 21 Feb 2022 17:53:56 +0800
+Subject: drm/amdgpu: check vm ready by amdgpu_vm->evicting flag
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Qiang Yu <qiang.yu@amd.com>
+
+[ Upstream commit c1a66c3bc425ff93774fb2f6eefa67b83170dd7e ]
+
+Workstation application ANSA/META v21.1.4 gets this error in dmesg when
+running the CI test suite provided by ANSA/META:
+[drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16)
+
+This is caused by:
+1. create a 256MB buffer in invisible VRAM
+2. CPU map the buffer and access it causes vm_fault and try to move
+   it to visible VRAM
+3. force visible VRAM space and traverse all VRAM bos to check if
+   evicting this bo is valuable
+4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable()
+   will set amdgpu_vm->evicting, but later, as the bo is not in visible
+   VRAM, it is not really evicted, so it is not added to amdgpu_vm->evicted
+5. before next CS to clear the amdgpu_vm->evicting, user VM ops
+   ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted)
+   but fail in amdgpu_vm_bo_update_mapping() (check
+   amdgpu_vm->evicting) and get this error log
+
+This error won't affect functionality as next CS will finish the
+waiting VM ops. But we'd better clear the error log by checking
+the amdgpu_vm->evicting flag in amdgpu_vm_ready() to stop calling
+amdgpu_vm_bo_update_mapping() later.
+
+Another reason is amdgpu_vm->evicted list holds all BOs (both
+user buffer and page table), but only page table BOs' eviction
+prevent VM ops. amdgpu_vm->evicting flag is set only for page
+table BOs, so we should use evicting flag instead of evicted list
+in amdgpu_vm_ready().
+
+The side effect of this change is: previously blocked VM op (user
+buffer in "evicted" list but no page table in it) gets done
+immediately.
+
+v2: update commit comments.
+
+Acked-by: Paul Menzel <pmenzel@molgen.mpg.de>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Qiang Yu <qiang.yu@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index 0e7dc23f78e7f..bc8d83698880c 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -768,11 +768,16 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+  * Check if all VM PDs/PTs are ready for updates
+  *
+  * Returns:
+- * True if eviction list is empty.
++ * True if VM is not evicting.
+  */
+ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
+ {
+-      return list_empty(&vm->evicted);
++      bool ret;
++
++      amdgpu_vm_eviction_lock(vm);
++      ret = !vm->evicting;
++      amdgpu_vm_eviction_unlock(vm);
++      return ret;
+ }
+ /**
+-- 
+2.34.1
+
diff --git a/queue-5.16/exfat-fix-i_blocks-for-files-truncated-over-4-gib.patch b/queue-5.16/exfat-fix-i_blocks-for-files-truncated-over-4-gib.patch
new file mode 100644 (file)
index 0000000..4b00976
--- /dev/null
@@ -0,0 +1,81 @@
+From 6d432547abd6645f7cc07a08f4eb84e73ac4c3f5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Nov 2021 22:02:37 +0900
+Subject: exfat: fix i_blocks for files truncated over 4 GiB
+
+From: Christophe Vu-Brugier <christophe.vu-brugier@seagate.com>
+
+[ Upstream commit 92fba084b79e6bc7b12fc118209f1922c1a2df56 ]
+
+In exfat_truncate(), the computation of inode->i_blocks is wrong if
+the file is larger than 4 GiB because a 32-bit variable is used as a
+mask. This is fixed and simplified by using round_up().
+
+Also fix the same buggy computation in exfat_read_root() and another
+(correct) one in exfat_fill_inode(). The latter was fixed another way
+last month but can be simplified by using round_up() as well. See:
+
+  commit 0c336d6e33f4 ("exfat: fix incorrect loading of i_blocks for
+                        large files")
+
+Fixes: 98d917047e8b ("exfat: add file operations")
+Cc: stable@vger.kernel.org # v5.7+
+Suggested-by: Matthew Wilcox <willy@infradead.org>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Christophe Vu-Brugier <christophe.vu-brugier@seagate.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/file.c  | 4 ++--
+ fs/exfat/inode.c | 4 ++--
+ fs/exfat/super.c | 4 ++--
+ 3 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/fs/exfat/file.c b/fs/exfat/file.c
+index 848166d6d5e9d..d890fd34bb2d0 100644
+--- a/fs/exfat/file.c
++++ b/fs/exfat/file.c
+@@ -251,8 +251,8 @@ void exfat_truncate(struct inode *inode, loff_t size)
+       else
+               mark_inode_dirty(inode);
+-      inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
+-                      ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
++      inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
++                              inode->i_blkbits;
+ write_size:
+       aligned_size = i_size_read(inode);
+       if (aligned_size & (blocksize - 1)) {
+diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
+index aca2e64d045b6..72a0ccfb616c3 100644
+--- a/fs/exfat/inode.c
++++ b/fs/exfat/inode.c
+@@ -602,8 +602,8 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
+       exfat_save_attr(inode, info->attr);
+-      inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
+-              ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits;
++      inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
++                              inode->i_blkbits;
+       inode->i_mtime = info->mtime;
+       inode->i_ctime = info->mtime;
+       ei->i_crtime = info->crtime;
+diff --git a/fs/exfat/super.c b/fs/exfat/super.c
+index 1a2115d73a48a..4b5d02b1df585 100644
+--- a/fs/exfat/super.c
++++ b/fs/exfat/super.c
+@@ -364,8 +364,8 @@ static int exfat_read_root(struct inode *inode)
+       inode->i_op = &exfat_dir_inode_operations;
+       inode->i_fop = &exfat_dir_operations;
+-      inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1))
+-                      & ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
++      inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
++                              inode->i_blkbits;
+       ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
+       ei->i_size_aligned = i_size_read(inode);
+       ei->i_size_ondisk = i_size_read(inode);
+-- 
+2.34.1
+
diff --git a/queue-5.16/exfat-reuse-exfat_inode_info-variable-instead-of-cal.patch b/queue-5.16/exfat-reuse-exfat_inode_info-variable-instead-of-cal.patch
new file mode 100644 (file)
index 0000000..5d4f1eb
--- /dev/null
@@ -0,0 +1,134 @@
+From 5dcaaecb0895d026a2954b4b5d82e8fe995f610c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Nov 2021 22:23:58 +0100
+Subject: exfat: reuse exfat_inode_info variable instead of calling EXFAT_I()
+
+From: Christophe Vu-Brugier <christophe.vu-brugier@seagate.com>
+
+[ Upstream commit 7dee6f57d7f22a89dd214518c778aec448270d4c ]
+
+Also add a local "struct exfat_inode_info *ei" variable to
+exfat_truncate() to simplify the code.
+
+Signed-off-by: Christophe Vu-Brugier <christophe.vu-brugier@seagate.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/exfat/file.c  | 14 +++++++-------
+ fs/exfat/inode.c |  9 ++++-----
+ fs/exfat/namei.c |  6 +++---
+ fs/exfat/super.c |  6 +++---
+ 4 files changed, 17 insertions(+), 18 deletions(-)
+
+diff --git a/fs/exfat/file.c b/fs/exfat/file.c
+index 6af0191b648f1..848166d6d5e9d 100644
+--- a/fs/exfat/file.c
++++ b/fs/exfat/file.c
+@@ -110,8 +110,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
+       exfat_set_volume_dirty(sb);
+       num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
+-      num_clusters_phys =
+-              EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi);
++      num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+       exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
+@@ -228,12 +227,13 @@ void exfat_truncate(struct inode *inode, loff_t size)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct exfat_sb_info *sbi = EXFAT_SB(sb);
++      struct exfat_inode_info *ei = EXFAT_I(inode);
+       unsigned int blocksize = i_blocksize(inode);
+       loff_t aligned_size;
+       int err;
+       mutex_lock(&sbi->s_lock);
+-      if (EXFAT_I(inode)->start_clu == 0) {
++      if (ei->start_clu == 0) {
+               /*
+                * Empty start_clu != ~0 (not allocated)
+                */
+@@ -260,11 +260,11 @@ void exfat_truncate(struct inode *inode, loff_t size)
+               aligned_size++;
+       }
+-      if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode))
+-              EXFAT_I(inode)->i_size_ondisk = aligned_size;
++      if (ei->i_size_ondisk > i_size_read(inode))
++              ei->i_size_ondisk = aligned_size;
+-      if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode))
+-              EXFAT_I(inode)->i_size_aligned = aligned_size;
++      if (ei->i_size_aligned > i_size_read(inode))
++              ei->i_size_aligned = aligned_size;
+       mutex_unlock(&sbi->s_lock);
+ }
+diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
+index 1c7aa1ea4724c..aca2e64d045b6 100644
+--- a/fs/exfat/inode.c
++++ b/fs/exfat/inode.c
+@@ -114,10 +114,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
+       unsigned int local_clu_offset = clu_offset;
+       unsigned int num_to_be_allocated = 0, num_clusters = 0;
+-      if (EXFAT_I(inode)->i_size_ondisk > 0)
++      if (ei->i_size_ondisk > 0)
+               num_clusters =
+-                      EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk,
+-                      sbi);
++                      EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+       if (clu_offset >= num_clusters)
+               num_to_be_allocated = clu_offset - num_clusters + 1;
+@@ -416,10 +415,10 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
+       err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+-      if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) {
++      if (ei->i_size_aligned < i_size_read(inode)) {
+               exfat_fs_error(inode->i_sb,
+                       "invalid size(size(%llu) > aligned(%llu)\n",
+-                      i_size_read(inode), EXFAT_I(inode)->i_size_aligned);
++                      i_size_read(inode), ei->i_size_aligned);
+               return -EIO;
+       }
+diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
+index 24b41103d1cc0..9d8ada781250b 100644
+--- a/fs/exfat/namei.c
++++ b/fs/exfat/namei.c
+@@ -395,9 +395,9 @@ static int exfat_find_empty_entry(struct inode *inode,
+               /* directory inode should be updated in here */
+               i_size_write(inode, size);
+-              EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size;
+-              EXFAT_I(inode)->i_size_aligned += sbi->cluster_size;
+-              EXFAT_I(inode)->flags = p_dir->flags;
++              ei->i_size_ondisk += sbi->cluster_size;
++              ei->i_size_aligned += sbi->cluster_size;
++              ei->flags = p_dir->flags;
+               inode->i_blocks += 1 << sbi->sect_per_clus_bits;
+       }
+diff --git a/fs/exfat/super.c b/fs/exfat/super.c
+index 5539ffc20d164..1a2115d73a48a 100644
+--- a/fs/exfat/super.c
++++ b/fs/exfat/super.c
+@@ -366,9 +366,9 @@ static int exfat_read_root(struct inode *inode)
+       inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1))
+                       & ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+-      EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
+-      EXFAT_I(inode)->i_size_aligned = i_size_read(inode);
+-      EXFAT_I(inode)->i_size_ondisk = i_size_read(inode);
++      ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
++      ei->i_size_aligned = i_size_read(inode);
++      ei->i_size_ondisk = i_size_read(inode);
+       exfat_save_attr(inode, ATTR_SUBDIR);
+       inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+-- 
+2.34.1
+
diff --git a/queue-5.16/ext4-drop-ineligible-txn-start-stop-apis.patch b/queue-5.16/ext4-drop-ineligible-txn-start-stop-apis.patch
new file mode 100644 (file)
index 0000000..9afe255
--- /dev/null
@@ -0,0 +1,251 @@
+From f5b14d8a3321344787c8cff251bf27bb0935a0f2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Dec 2021 12:21:38 -0800
+Subject: ext4: drop ineligible txn start stop APIs
+
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+
+[ Upstream commit 7bbbe241ec7ce0def9f71464c878fdbd2b0dcf37 ]
+
+This patch drops ext4_fc_start_ineligible() and
+ext4_fc_stop_ineligible() APIs. Fast commit ineligible transactions
+should simply call ext4_fc_mark_ineligible() after starting the
+transaction.
+
+Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211223202140.2061101-3-harshads@google.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/ext4.h        |  6 ++--
+ fs/ext4/extents.c     |  6 ++--
+ fs/ext4/fast_commit.c | 79 ++++++++-----------------------------------
+ fs/ext4/ioctl.c       |  3 +-
+ fs/ext4/super.c       |  1 -
+ 5 files changed, 20 insertions(+), 75 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index d248a01132c3b..f80e4de726869 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1725,9 +1725,9 @@ struct ext4_sb_info {
+        */
+       struct work_struct s_error_work;
+-      /* Ext4 fast commit stuff */
++      /* Ext4 fast commit sub transaction ID */
+       atomic_t s_fc_subtid;
+-      atomic_t s_fc_ineligible_updates;
++
+       /*
+        * After commit starts, the main queue gets locked, and the further
+        * updates get added in the staging queue.
+@@ -2926,8 +2926,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+ void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
+ void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
+-void ext4_fc_start_ineligible(struct super_block *sb, int reason);
+-void ext4_fc_stop_ineligible(struct super_block *sb);
+ void ext4_fc_start_update(struct inode *inode);
+ void ext4_fc_stop_update(struct inode *inode);
+ void ext4_fc_del(struct inode *inode);
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 9b37d16b24ffd..d3a8d704d8b4f 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -5342,7 +5342,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+               ret = PTR_ERR(handle);
+               goto out_mmap;
+       }
+-      ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode, 0);
+@@ -5381,7 +5381,6 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ out_stop:
+       ext4_journal_stop(handle);
+-      ext4_fc_stop_ineligible(sb);
+ out_mmap:
+       filemap_invalidate_unlock(mapping);
+ out_mutex:
+@@ -5483,7 +5482,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+               ret = PTR_ERR(handle);
+               goto out_mmap;
+       }
+-      ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+       /* Expand file to avoid data loss if there is error while shifting */
+       inode->i_size += len;
+@@ -5558,7 +5557,6 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+ out_stop:
+       ext4_journal_stop(handle);
+-      ext4_fc_stop_ineligible(sb);
+ out_mmap:
+       filemap_invalidate_unlock(mapping);
+ out_mutex:
+diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
+index 3b79fb063c07a..48e522bb7bca4 100644
+--- a/fs/ext4/fast_commit.c
++++ b/fs/ext4/fast_commit.c
+@@ -65,21 +65,11 @@
+  *
+  * Fast Commit Ineligibility
+  * -------------------------
+- * Not all operations are supported by fast commits today (e.g extended
+- * attributes). Fast commit ineligibility is marked by calling one of the
+- * two following functions:
+- *
+- * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
+- *   back to full commit. This is useful in case of transient errors.
+  *
+- * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
+- *   the fast commits happening between ext4_fc_start_ineligible() and
+- *   ext4_fc_stop_ineligible() and one fast commit after the call to
+- *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
+- *   make one more fast commit to fall back to full commit after stop call so
+- *   that it guaranteed that the fast commit ineligible operation contained
+- *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
+- *   followed by at least 1 full commit.
++ * Not all operations are supported by fast commits today (e.g extended
++ * attributes). Fast commit ineligibility is marked by calling
++ * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
++ * to full commit.
+  *
+  * Atomicity of commits
+  * --------------------
+@@ -328,44 +318,6 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
+       sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+ }
+-/*
+- * Start a fast commit ineligible update. Any commits that happen while
+- * such an operation is in progress fall back to full commits.
+- */
+-void ext4_fc_start_ineligible(struct super_block *sb, int reason)
+-{
+-      struct ext4_sb_info *sbi = EXT4_SB(sb);
+-
+-      if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+-          (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+-              return;
+-
+-      WARN_ON(reason >= EXT4_FC_REASON_MAX);
+-      sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+-      atomic_inc(&sbi->s_fc_ineligible_updates);
+-}
+-
+-/*
+- * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
+- * to ensure that after stopping the ineligible update, at least one full
+- * commit takes place.
+- */
+-void ext4_fc_stop_ineligible(struct super_block *sb)
+-{
+-      if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+-          (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+-              return;
+-
+-      ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+-      atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
+-}
+-
+-static inline int ext4_fc_is_ineligible(struct super_block *sb)
+-{
+-      return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
+-              atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
+-}
+-
+ /*
+  * Generic fast commit tracking function. If this is the first time this we are
+  * called after a full commit, we initialize fast commit fields and then call
+@@ -391,7 +343,7 @@ static int ext4_fc_track_template(
+           (sbi->s_mount_state & EXT4_FC_REPLAY))
+               return -EOPNOTSUPP;
+-      if (ext4_fc_is_ineligible(inode->i_sb))
++      if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+               return -EINVAL;
+       tid = handle->h_transaction->t_tid;
+@@ -1142,11 +1094,8 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+       start_time = ktime_get();
+-      if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+-              (ext4_fc_is_ineligible(sb))) {
+-              reason = EXT4_FC_REASON_INELIGIBLE;
+-              goto out;
+-      }
++      if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
++              return jbd2_complete_transaction(journal, commit_tid);
+ restart_fc:
+       ret = jbd2_fc_begin_commit(journal, commit_tid);
+@@ -1162,6 +1111,14 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+               reason = EXT4_FC_REASON_FC_START_FAILED;
+               goto out;
+       }
++      /*
++       * After establishing journal barrier via jbd2_fc_begin_commit(), check
++       * if we are fast commit ineligible.
++       */
++      if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
++              reason = EXT4_FC_REASON_INELIGIBLE;
++              goto out;
++      }
+       fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+       ret = ext4_fc_perform_commit(journal);
+@@ -1180,12 +1137,6 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+       atomic_inc(&sbi->s_fc_subtid);
+       jbd2_fc_end_commit(journal);
+ out:
+-      /* Has any ineligible update happened since we started? */
+-      if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
+-              sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+-              reason = EXT4_FC_REASON_INELIGIBLE;
+-      }
+-
+       spin_lock(&sbi->s_fc_lock);
+       if (reason != EXT4_FC_REASON_OK &&
+               reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 220a4c8178b5e..fd70bebb14370 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -169,7 +169,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
+               err = -EINVAL;
+               goto err_out;
+       }
+-      ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
+       /* Protect extent tree against block allocations via delalloc */
+       ext4_double_down_write_data_sem(inode, inode_bl);
+@@ -252,7 +252,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
+ err_out1:
+       ext4_journal_stop(handle);
+-      ext4_fc_stop_ineligible(sb);
+       ext4_double_up_write_data_sem(inode, inode_bl);
+ err_out:
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 24a7ad80353b5..d304b72593d76 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4620,7 +4620,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       /* Initialize fast commit stuff */
+       atomic_set(&sbi->s_fc_subtid, 0);
+-      atomic_set(&sbi->s_fc_ineligible_updates, 0);
+       INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+       INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+       INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+-- 
+2.34.1
+
diff --git a/queue-5.16/ext4-fast-commit-may-miss-file-actions.patch b/queue-5.16/ext4-fast-commit-may-miss-file-actions.patch
new file mode 100644 (file)
index 0000000..1b96a51
--- /dev/null
@@ -0,0 +1,117 @@
+From dc0cacc25bc9df3a8521ebf4875cac0c2248bbfb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 17 Jan 2022 17:36:55 +0800
+Subject: ext4: fast commit may miss file actions
+
+From: Xin Yin <yinxin.x@bytedance.com>
+
+[ Upstream commit bdc8a53a6f2f0b1cb5f991440f2100732299eb93 ]
+
+In the following scenario:
+1. jbd start transaction n
+2. task A get new handle for transaction n+1
+3. task A do some actions and add inode to FC_Q_MAIN fc_q
+4. jbd complete transaction n and clear FC_Q_MAIN fc_q
+5. task A call fsync
+
+Fast commit will lose the file actions during a full commit.
+
+We should also add updates to the staging queue during a full commit.
+And in ext4_fc_cleanup(), when resetting an inode's fc track range, check
+its i_sync_tid; if it is bigger than the current transaction tid, do not
+reset it, or we will lose the track range.
+
+And EXT4_MF_FC_COMMITTING is not needed anymore, so drop it.
+
+Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
+Link: https://lore.kernel.org/r/20220117093655.35160-3-yinxin.x@bytedance.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/ext4.h        |  5 +----
+ fs/ext4/fast_commit.c | 11 ++++++-----
+ fs/ext4/super.c       |  1 -
+ 3 files changed, 7 insertions(+), 10 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 8b5015ea46199..c2cc9d78915b0 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1793,10 +1793,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
+ enum {
+       EXT4_MF_MNTDIR_SAMPLED,
+       EXT4_MF_FS_ABORTED,     /* Fatal error detected */
+-      EXT4_MF_FC_INELIGIBLE,  /* Fast commit ineligible */
+-      EXT4_MF_FC_COMMITTING   /* File system underoing a fast
+-                               * commit.
+-                               */
++      EXT4_MF_FC_INELIGIBLE   /* Fast commit ineligible */
+ };
+ static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
+diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
+index 0cdfc5003d91a..aca8414706346 100644
+--- a/fs/ext4/fast_commit.c
++++ b/fs/ext4/fast_commit.c
+@@ -377,7 +377,8 @@ static int ext4_fc_track_template(
+       spin_lock(&sbi->s_fc_lock);
+       if (list_empty(&EXT4_I(inode)->i_fc_list))
+               list_add_tail(&EXT4_I(inode)->i_fc_list,
+-                              (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
++                              (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
++                               sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
+                               &sbi->s_fc_q[FC_Q_STAGING] :
+                               &sbi->s_fc_q[FC_Q_MAIN]);
+       spin_unlock(&sbi->s_fc_lock);
+@@ -430,7 +431,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+       node->fcd_name.len = dentry->d_name.len;
+       spin_lock(&sbi->s_fc_lock);
+-      if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
++      if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
++              sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
+               list_add_tail(&node->fcd_list,
+                               &sbi->s_fc_dentry_q[FC_Q_STAGING]);
+       else
+@@ -896,7 +898,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
+       int ret = 0;
+       spin_lock(&sbi->s_fc_lock);
+-      ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+       list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+               ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
+               while (atomic_read(&ei->i_fc_updates)) {
+@@ -1214,7 +1215,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
+               list_del_init(&iter->i_fc_list);
+               ext4_clear_inode_state(&iter->vfs_inode,
+                                      EXT4_STATE_FC_COMMITTING);
+-              ext4_fc_reset_inode(&iter->vfs_inode);
++              if (iter->i_sync_tid <= tid)
++                      ext4_fc_reset_inode(&iter->vfs_inode);
+               /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+               smp_mb();
+ #if (BITS_PER_LONG < 64)
+@@ -1243,7 +1245,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
+       list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
+                               &sbi->s_fc_q[FC_Q_MAIN]);
+-      ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+       if (tid >= sbi->s_fc_ineligible_tid) {
+               sbi->s_fc_ineligible_tid = 0;
+               ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 888b2db92924d..32ca34403dcec 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4626,7 +4626,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+       sbi->s_fc_bytes = 0;
+       ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+-      ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+       sbi->s_fc_ineligible_tid = 0;
+       spin_lock_init(&sbi->s_fc_lock);
+       memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+-- 
+2.34.1
+
diff --git a/queue-5.16/ext4-fast-commit-may-not-fallback-for-ineligible-com.patch b/queue-5.16/ext4-fast-commit-may-not-fallback-for-ineligible-com.patch
new file mode 100644 (file)
index 0000000..9358370
--- /dev/null
@@ -0,0 +1,329 @@
+From 642f8c30b87603489a33051fd74a51c0a4447cf7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 17 Jan 2022 17:36:54 +0800
+Subject: ext4: fast commit may not fallback for ineligible commit
+
+From: Xin Yin <yinxin.x@bytedance.com>
+
+[ Upstream commit e85c81ba8859a4c839bcd69c5d83b32954133a5b ]
+
+For the following scenario:
+1. jbd starts committing transaction n
+2. task A gets a new handle for transaction n+1
+3. task A does some ineligible actions and marks FC_INELIGIBLE
+4. jbd completes transaction n and clears FC_INELIGIBLE
+5. task A calls fsync
+
+In this case fast commit will not fall back to a full commit, and
+transaction n+1 is also not handled by jbd.
+
+Make ext4_fc_mark_ineligible() also record the transaction tid of
+the latest ineligible case. When ext4_fc_cleanup() is called, check
+the current transaction tid; if it is smaller than the latest
+ineligible tid, do not clear EXT4_MF_FC_INELIGIBLE.
+
+Reported-by: kernel test robot <lkp@intel.com>
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
+Suggested-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
+Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/ext4.h        |  3 ++-
+ fs/ext4/extents.c     |  4 ++--
+ fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++--------
+ fs/ext4/inode.c       |  4 ++--
+ fs/ext4/ioctl.c       |  4 ++--
+ fs/ext4/namei.c       |  4 ++--
+ fs/ext4/super.c       |  1 +
+ fs/ext4/xattr.c       |  6 +++---
+ fs/jbd2/commit.c      |  2 +-
+ fs/jbd2/journal.c     |  2 +-
+ include/linux/jbd2.h  |  2 +-
+ 11 files changed, 42 insertions(+), 23 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 470fd3c2aef54..8b5015ea46199 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1747,6 +1747,7 @@ struct ext4_sb_info {
+       spinlock_t s_fc_lock;
+       struct buffer_head *s_fc_bh;
+       struct ext4_fc_stats s_fc_stats;
++      tid_t s_fc_ineligible_tid;
+ #ifdef CONFIG_EXT4_DEBUG
+       int s_fc_debug_max_replay;
+ #endif
+@@ -2924,7 +2925,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+                           struct dentry *dentry);
+ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+ void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
+-void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
++void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
+ void ext4_fc_start_update(struct inode *inode);
+ void ext4_fc_stop_update(struct inode *inode);
+ void ext4_fc_del(struct inode *inode);
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index d3a8d704d8b4f..d2667189be7e5 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -5342,7 +5342,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+               ret = PTR_ERR(handle);
+               goto out_mmap;
+       }
+-      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode, 0);
+@@ -5482,7 +5482,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+               ret = PTR_ERR(handle);
+               goto out_mmap;
+       }
+-      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+       /* Expand file to avoid data loss if there is error while shifting */
+       inode->i_size += len;
+diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
+index 1b935feec6f6b..0cdfc5003d91a 100644
+--- a/fs/ext4/fast_commit.c
++++ b/fs/ext4/fast_commit.c
+@@ -302,18 +302,32 @@ void ext4_fc_del(struct inode *inode)
+ }
+ /*
+- * Mark file system as fast commit ineligible. This means that next commit
+- * operation would result in a full jbd2 commit.
++ * Mark file system as fast commit ineligible, and record latest
++ * ineligible transaction tid. This means until the recorded
++ * transaction, commit operation would result in a full jbd2 commit.
+  */
+-void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
++void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
++      tid_t tid;
+       if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+           (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+               return;
+       ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
++      if (handle && !IS_ERR(handle))
++              tid = handle->h_transaction->t_tid;
++      else {
++              read_lock(&sbi->s_journal->j_state_lock);
++              tid = sbi->s_journal->j_running_transaction ?
++                              sbi->s_journal->j_running_transaction->t_tid : 0;
++              read_unlock(&sbi->s_journal->j_state_lock);
++      }
++      spin_lock(&sbi->s_fc_lock);
++      if (sbi->s_fc_ineligible_tid < tid)
++              sbi->s_fc_ineligible_tid = tid;
++      spin_unlock(&sbi->s_fc_lock);
+       WARN_ON(reason >= EXT4_FC_REASON_MAX);
+       sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+ }
+@@ -389,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+       mutex_unlock(&ei->i_fc_lock);
+       node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
+       if (!node) {
+-              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
++              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+               mutex_lock(&ei->i_fc_lock);
+               return -ENOMEM;
+       }
+@@ -402,7 +416,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+               if (!node->fcd_name.name) {
+                       kmem_cache_free(ext4_fc_dentry_cachep, node);
+                       ext4_fc_mark_ineligible(inode->i_sb,
+-                              EXT4_FC_REASON_NOMEM);
++                              EXT4_FC_REASON_NOMEM, NULL);
+                       mutex_lock(&ei->i_fc_lock);
+                       return -ENOMEM;
+               }
+@@ -504,7 +518,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
+       if (ext4_should_journal_data(inode)) {
+               ext4_fc_mark_ineligible(inode->i_sb,
+-                                      EXT4_FC_REASON_INODE_JOURNAL_DATA);
++                                      EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
+               return;
+       }
+@@ -1182,7 +1196,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+  * Fast commit cleanup routine. This is called after every fast commit and
+  * full commit. full is true if we are called after a full commit.
+  */
+-static void ext4_fc_cleanup(journal_t *journal, int full)
++static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
+ {
+       struct super_block *sb = journal->j_private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -1230,7 +1244,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
+                               &sbi->s_fc_q[FC_Q_MAIN]);
+       ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+-      ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
++      if (tid >= sbi->s_fc_ineligible_tid) {
++              sbi->s_fc_ineligible_tid = 0;
++              ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
++      }
+       if (full)
+               sbi->s_fc_bytes = 0;
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 3bdfe010e17f9..2f5686dfa30d5 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -337,7 +337,7 @@ void ext4_evict_inode(struct inode *inode)
+       return;
+ no_delete:
+       if (!list_empty(&EXT4_I(inode)->i_fc_list))
+-              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
++              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+       ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
+ }
+@@ -5983,7 +5983,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
+               return PTR_ERR(handle);
+       ext4_fc_mark_ineligible(inode->i_sb,
+-              EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
++              EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
+       err = ext4_mark_inode_dirty(handle, inode);
+       ext4_handle_sync(handle);
+       ext4_journal_stop(handle);
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index fd70bebb14370..f61b59045c6d3 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -169,7 +169,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
+               err = -EINVAL;
+               goto err_out;
+       }
+-      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
++      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
+       /* Protect extent tree against block allocations via delalloc */
+       ext4_double_down_write_data_sem(inode, inode_bl);
+@@ -1075,7 +1075,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+               err = ext4_resize_fs(sb, n_blocks_count);
+               if (EXT4_SB(sb)->s_journal) {
+-                      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
++                      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
+                       jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+                       err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
+                       jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 52c9bd154122a..47b9f87dbc6f7 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+                * dirents in directories.
+                */
+               ext4_fc_mark_ineligible(old.inode->i_sb,
+-                      EXT4_FC_REASON_RENAME_DIR);
++                      EXT4_FC_REASON_RENAME_DIR, handle);
+       } else {
+               if (new.inode)
+                       ext4_fc_track_unlink(handle, new.dentry);
+@@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+       if (unlikely(retval))
+               goto end_rename;
+       ext4_fc_mark_ineligible(new.inode->i_sb,
+-                              EXT4_FC_REASON_CROSS_RENAME);
++                              EXT4_FC_REASON_CROSS_RENAME, handle);
+       if (old.dir_bh) {
+               retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+               if (retval)
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index d304b72593d76..888b2db92924d 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4627,6 +4627,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+       sbi->s_fc_bytes = 0;
+       ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+       ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
++      sbi->s_fc_ineligible_tid = 0;
+       spin_lock_init(&sbi->s_fc_lock);
+       memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+       sbi->s_fc_replay_state.fc_regions = NULL;
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 1e0fc1ed845bf..0423253490986 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -2408,7 +2408,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+               if (IS_SYNC(inode))
+                       ext4_handle_sync(handle);
+       }
+-      ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
++      ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
+ cleanup:
+       brelse(is.iloc.bh);
+@@ -2486,7 +2486,7 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+               if (error == 0)
+                       error = error2;
+       }
+-      ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
++      ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL);
+       return error;
+ }
+@@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
+                                        error);
+                       goto cleanup;
+               }
+-              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
++              ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
+       }
+       error = 0;
+ cleanup:
+diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
+index 3cc4ab2ba7f4f..d188fa913a075 100644
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -1170,7 +1170,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
+       if (journal->j_commit_callback)
+               journal->j_commit_callback(journal, commit_transaction);
+       if (journal->j_fc_cleanup_callback)
+-              journal->j_fc_cleanup_callback(journal, 1);
++              journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
+       trace_jbd2_end_commit(journal, commit_transaction);
+       jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
+index bd9ac98916043..1f8493ef181d6 100644
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -769,7 +769,7 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
+ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
+ {
+       if (journal->j_fc_cleanup_callback)
+-              journal->j_fc_cleanup_callback(journal, 0);
++              journal->j_fc_cleanup_callback(journal, 0, tid);
+       write_lock(&journal->j_state_lock);
+       journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
+       if (fallback)
+diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
+index fd933c45281af..d63b8106796e2 100644
+--- a/include/linux/jbd2.h
++++ b/include/linux/jbd2.h
+@@ -1295,7 +1295,7 @@ struct journal_s
+        * Clean-up after fast commit or full commit. JBD2 calls this function
+        * after every commit operation.
+        */
+-      void (*j_fc_cleanup_callback)(struct journal_s *journal, int);
++      void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid);
+       /**
+        * @j_fc_replay_callback:
+-- 
+2.34.1
+
diff --git a/queue-5.16/ext4-simplify-updating-of-fast-commit-stats.patch b/queue-5.16/ext4-simplify-updating-of-fast-commit-stats.patch
new file mode 100644 (file)
index 0000000..897fb40
--- /dev/null
@@ -0,0 +1,237 @@
+From d29ecbeb7189008f18506200809e12a2f7fe49f4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Dec 2021 12:21:39 -0800
+Subject: ext4: simplify updating of fast commit stats
+
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+
+[ Upstream commit 0915e464cb274648e1ef1663e1356e53ff400983 ]
+
+Move fast commit stats updating logic to a separate function from
+ext4_fc_commit(). This significantly improves readability of
+ext4_fc_commit().
+
+Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211223202140.2061101-4-harshads@google.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/ext4.h        |  1 -
+ fs/ext4/fast_commit.c | 99 +++++++++++++++++++++++--------------------
+ fs/ext4/fast_commit.h | 27 ++++++------
+ 3 files changed, 68 insertions(+), 59 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index f80e4de726869..470fd3c2aef54 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1747,7 +1747,6 @@ struct ext4_sb_info {
+       spinlock_t s_fc_lock;
+       struct buffer_head *s_fc_bh;
+       struct ext4_fc_stats s_fc_stats;
+-      u64 s_fc_avg_commit_time;
+ #ifdef CONFIG_EXT4_DEBUG
+       int s_fc_debug_max_replay;
+ #endif
+diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
+index 48e522bb7bca4..1b935feec6f6b 100644
+--- a/fs/ext4/fast_commit.c
++++ b/fs/ext4/fast_commit.c
+@@ -1075,6 +1075,32 @@ static int ext4_fc_perform_commit(journal_t *journal)
+       return ret;
+ }
++static void ext4_fc_update_stats(struct super_block *sb, int status,
++                               u64 commit_time, int nblks)
++{
++      struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
++
++      jbd_debug(1, "Fast commit ended with status = %d", status);
++      if (status == EXT4_FC_STATUS_OK) {
++              stats->fc_num_commits++;
++              stats->fc_numblks += nblks;
++              if (likely(stats->s_fc_avg_commit_time))
++                      stats->s_fc_avg_commit_time =
++                              (commit_time +
++                               stats->s_fc_avg_commit_time * 3) / 4;
++              else
++                      stats->s_fc_avg_commit_time = commit_time;
++      } else if (status == EXT4_FC_STATUS_FAILED ||
++                 status == EXT4_FC_STATUS_INELIGIBLE) {
++              if (status == EXT4_FC_STATUS_FAILED)
++                      stats->fc_failed_commits++;
++              stats->fc_ineligible_commits++;
++      } else {
++              stats->fc_skipped_commits++;
++      }
++      trace_ext4_fc_commit_stop(sb, nblks, status);
++}
++
+ /*
+  * The main commit entry point. Performs a fast commit for transaction
+  * commit_tid if needed. If it's not possible to perform a fast commit
+@@ -1087,7 +1113,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int nblks = 0, ret, bsize = journal->j_blocksize;
+       int subtid = atomic_read(&sbi->s_fc_subtid);
+-      int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
++      int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
+       ktime_t start_time, commit_time;
+       trace_ext4_fc_commit_start(sb);
+@@ -1104,69 +1130,52 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+               if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
+                       commit_tid > journal->j_commit_sequence)
+                       goto restart_fc;
+-              reason = EXT4_FC_REASON_ALREADY_COMMITTED;
+-              goto out;
++              ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
++              return 0;
+       } else if (ret) {
+-              sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+-              reason = EXT4_FC_REASON_FC_START_FAILED;
+-              goto out;
++              /*
++               * Commit couldn't start. Just update stats and perform a
++               * full commit.
++               */
++              ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
++              return jbd2_complete_transaction(journal, commit_tid);
+       }
++
+       /*
+        * After establishing journal barrier via jbd2_fc_begin_commit(), check
+        * if we are fast commit ineligible.
+        */
+       if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
+-              reason = EXT4_FC_REASON_INELIGIBLE;
+-              goto out;
++              status = EXT4_FC_STATUS_INELIGIBLE;
++              goto fallback;
+       }
+       fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+       ret = ext4_fc_perform_commit(journal);
+       if (ret < 0) {
+-              sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+-              reason = EXT4_FC_REASON_FC_FAILED;
+-              goto out;
++              status = EXT4_FC_STATUS_FAILED;
++              goto fallback;
+       }
+       nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
+       ret = jbd2_fc_wait_bufs(journal, nblks);
+       if (ret < 0) {
+-              sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+-              reason = EXT4_FC_REASON_FC_FAILED;
+-              goto out;
++              status = EXT4_FC_STATUS_FAILED;
++              goto fallback;
+       }
+       atomic_inc(&sbi->s_fc_subtid);
+-      jbd2_fc_end_commit(journal);
+-out:
+-      spin_lock(&sbi->s_fc_lock);
+-      if (reason != EXT4_FC_REASON_OK &&
+-              reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
+-              sbi->s_fc_stats.fc_ineligible_commits++;
+-      } else {
+-              sbi->s_fc_stats.fc_num_commits++;
+-              sbi->s_fc_stats.fc_numblks += nblks;
+-      }
+-      spin_unlock(&sbi->s_fc_lock);
+-      nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
+-      trace_ext4_fc_commit_stop(sb, nblks, reason);
+-      commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++      ret = jbd2_fc_end_commit(journal);
+       /*
+-       * weight the commit time higher than the average time so we don't
+-       * react too strongly to vast changes in the commit time
++       * weight the commit time higher than the average time so we
++       * don't react too strongly to vast changes in the commit time
+        */
+-      if (likely(sbi->s_fc_avg_commit_time))
+-              sbi->s_fc_avg_commit_time = (commit_time +
+-                              sbi->s_fc_avg_commit_time * 3) / 4;
+-      else
+-              sbi->s_fc_avg_commit_time = commit_time;
+-      jbd_debug(1,
+-              "Fast commit ended with blks = %d, reason = %d, subtid - %d",
+-              nblks, reason, subtid);
+-      if (reason == EXT4_FC_REASON_FC_FAILED)
+-              return jbd2_fc_end_commit_fallback(journal);
+-      if (reason == EXT4_FC_REASON_FC_START_FAILED ||
+-              reason == EXT4_FC_REASON_INELIGIBLE)
+-              return jbd2_complete_transaction(journal, commit_tid);
+-      return 0;
++      commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
++      ext4_fc_update_stats(sb, status, commit_time, nblks);
++      return ret;
++
++fallback:
++      ret = jbd2_fc_end_commit_fallback(journal);
++      ext4_fc_update_stats(sb, status, 0, 0);
++      return ret;
+ }
+ /*
+@@ -2132,7 +2141,7 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
+               "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
+                  stats->fc_num_commits, stats->fc_ineligible_commits,
+                  stats->fc_numblks,
+-                 div_u64(sbi->s_fc_avg_commit_time, 1000));
++                 div_u64(stats->s_fc_avg_commit_time, 1000));
+       seq_puts(seq, "Ineligible reasons:\n");
+       for (i = 0; i < EXT4_FC_REASON_MAX; i++)
+               seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
+diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
+index 937c381b4c85e..083ad1cb705a7 100644
+--- a/fs/ext4/fast_commit.h
++++ b/fs/ext4/fast_commit.h
+@@ -71,21 +71,19 @@ struct ext4_fc_tail {
+ };
+ /*
+- * Fast commit reason codes
++ * Fast commit status codes
++ */
++enum {
++      EXT4_FC_STATUS_OK = 0,
++      EXT4_FC_STATUS_INELIGIBLE,
++      EXT4_FC_STATUS_SKIPPED,
++      EXT4_FC_STATUS_FAILED,
++};
++
++/*
++ * Fast commit ineligiblity reasons:
+  */
+ enum {
+-      /*
+-       * Commit status codes:
+-       */
+-      EXT4_FC_REASON_OK = 0,
+-      EXT4_FC_REASON_INELIGIBLE,
+-      EXT4_FC_REASON_ALREADY_COMMITTED,
+-      EXT4_FC_REASON_FC_START_FAILED,
+-      EXT4_FC_REASON_FC_FAILED,
+-
+-      /*
+-       * Fast commit ineligiblity reasons:
+-       */
+       EXT4_FC_REASON_XATTR = 0,
+       EXT4_FC_REASON_CROSS_RENAME,
+       EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
+@@ -117,7 +115,10 @@ struct ext4_fc_stats {
+       unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
+       unsigned long fc_num_commits;
+       unsigned long fc_ineligible_commits;
++      unsigned long fc_failed_commits;
++      unsigned long fc_skipped_commits;
+       unsigned long fc_numblks;
++      u64 s_fc_avg_commit_time;
+ };
+ #define EXT4_FC_REPLAY_REALLOC_INCREMENT      4
+-- 
+2.34.1
+
diff --git a/queue-5.16/ipv6-fix-skb-drops-in-igmp6_event_query-and-igmp6_ev.patch b/queue-5.16/ipv6-fix-skb-drops-in-igmp6_event_query-and-igmp6_ev.patch
new file mode 100644 (file)
index 0000000..2f5820d
--- /dev/null
@@ -0,0 +1,120 @@
+From d524916e6a467112345767ffd5187488dfd6277c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Mar 2022 09:37:28 -0800
+Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 2d3916f3189172d5c69d33065c3c21119fe539fc ]
+
+While investigating why a synchronize_net() has been added recently
+in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report()
+might drop skbs in some cases.
+
+Discussion about removing synchronize_net() from ipv6_mc_down()
+will happen in a different thread.
+
+Fixes: f185de28d9ae ("mld: add new workqueues for process mld events")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Taehee Yoo <ap420073@gmail.com>
+Cc: Cong Wang <xiyou.wangcong@gmail.com>
+Cc: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ndisc.h |  4 ++--
+ net/ipv6/mcast.c    | 32 ++++++++++++--------------------
+ 2 files changed, 14 insertions(+), 22 deletions(-)
+
+diff --git a/include/net/ndisc.h b/include/net/ndisc.h
+index 04341d86585de..5e37e58586796 100644
+--- a/include/net/ndisc.h
++++ b/include/net/ndisc.h
+@@ -487,9 +487,9 @@ int igmp6_late_init(void);
+ void igmp6_cleanup(void);
+ void igmp6_late_cleanup(void);
+-int igmp6_event_query(struct sk_buff *skb);
++void igmp6_event_query(struct sk_buff *skb);
+-int igmp6_event_report(struct sk_buff *skb);
++void igmp6_event_report(struct sk_buff *skb);
+ #ifdef CONFIG_SYSCTL
+diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
+index a8861db52c187..909f937befd71 100644
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+ }
+ /* called with rcu_read_lock() */
+-int igmp6_event_query(struct sk_buff *skb)
++void igmp6_event_query(struct sk_buff *skb)
+ {
+       struct inet6_dev *idev = __in6_dev_get(skb->dev);
+-      if (!idev)
+-              return -EINVAL;
+-
+-      if (idev->dead) {
+-              kfree_skb(skb);
+-              return -ENODEV;
+-      }
++      if (!idev || idev->dead)
++              goto out;
+       spin_lock_bh(&idev->mc_query_lock);
+       if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) {
+               __skb_queue_tail(&idev->mc_query_queue, skb);
+               if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0))
+                       in6_dev_hold(idev);
++              skb = NULL;
+       }
+       spin_unlock_bh(&idev->mc_query_lock);
+-
+-      return 0;
++out:
++      kfree_skb(skb);
+ }
+ static void __mld_query_work(struct sk_buff *skb)
+@@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work)
+ }
+ /* called with rcu_read_lock() */
+-int igmp6_event_report(struct sk_buff *skb)
++void igmp6_event_report(struct sk_buff *skb)
+ {
+       struct inet6_dev *idev = __in6_dev_get(skb->dev);
+-      if (!idev)
+-              return -EINVAL;
+-
+-      if (idev->dead) {
+-              kfree_skb(skb);
+-              return -ENODEV;
+-      }
++      if (!idev || idev->dead)
++              goto out;
+       spin_lock_bh(&idev->mc_report_lock);
+       if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) {
+               __skb_queue_tail(&idev->mc_report_queue, skb);
+               if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0))
+                       in6_dev_hold(idev);
++              skb = NULL;
+       }
+       spin_unlock_bh(&idev->mc_report_lock);
+-
+-      return 0;
++out:
++      kfree_skb(skb);
+ }
+ static void __mld_report_work(struct sk_buff *skb)
+-- 
+2.34.1
+
diff --git a/queue-5.16/kvm-arm64-workaround-cortex-a510-s-single-step-and-p.patch b/queue-5.16/kvm-arm64-workaround-cortex-a510-s-single-step-and-p.patch
new file mode 100644 (file)
index 0000000..031d19d
--- /dev/null
@@ -0,0 +1,152 @@
+From 248f6fc5c8c2cbdfaa4d22847ba276c1ea01f5bb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 27 Jan 2022 12:20:52 +0000
+Subject: KVM: arm64: Workaround Cortex-A510's single-step and PAC trap errata
+
+From: James Morse <james.morse@arm.com>
+
+[ Upstream commit 1dd498e5e26ad71e3e9130daf72cfb6a693fee03 ]
+
+Cortex-A510's erratum #2077057 causes SPSR_EL2 to be corrupted when
+single-stepping authenticated ERET instructions. A single step is
+expected, but a pointer authentication trap is taken instead. The
+erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow
+EL1 to cause a return to EL2 with a guest controlled ELR_EL2.
+
+Because the conditions require an ERET into active-not-pending state,
+this is only a problem for EL2 when EL2 is stepping EL1. In this case
+the previous SPSR_EL2 value is preserved in struct kvm_vcpu, and can be
+restored.
+
+Cc: stable@vger.kernel.org # 53960faf2b73: arm64: Add Cortex-A510 CPU part definition
+Cc: stable@vger.kernel.org
+Signed-off-by: James Morse <james.morse@arm.com>
+[maz: fixup cpucaps ordering]
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Link: https://lore.kernel.org/r/20220127122052.1584324-5-james.morse@arm.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/arm64/silicon-errata.rst  |  2 ++
+ arch/arm64/Kconfig                      | 16 ++++++++++++++++
+ arch/arm64/kernel/cpu_errata.c          |  8 ++++++++
+ arch/arm64/kvm/hyp/include/hyp/switch.h | 20 +++++++++++++++++++-
+ arch/arm64/tools/cpucaps                |  5 +++--
+ 5 files changed, 48 insertions(+), 3 deletions(-)
+
+diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
+index 0ec7b7f1524b1..ea281dd755171 100644
+--- a/Documentation/arm64/silicon-errata.rst
++++ b/Documentation/arm64/silicon-errata.rst
+@@ -100,6 +100,8 @@ stable kernels.
+ +----------------+-----------------+-----------------+-----------------------------+
+ | ARM            | Cortex-A510     | #2051678        | ARM64_ERRATUM_2051678       |
+ +----------------+-----------------+-----------------+-----------------------------+
++| ARM            | Cortex-A510     | #2077057        | ARM64_ERRATUM_2077057       |
+++----------------+-----------------+-----------------+-----------------------------+
+ | ARM            | Cortex-A710     | #2119858        | ARM64_ERRATUM_2119858       |
+ +----------------+-----------------+-----------------+-----------------------------+
+ | ARM            | Cortex-A710     | #2054223        | ARM64_ERRATUM_2054223       |
+diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
+index ae0e93871ee5f..651bf217465e9 100644
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -681,6 +681,22 @@ config ARM64_ERRATUM_2051678
+         If unsure, say Y.
++config ARM64_ERRATUM_2077057
++      bool "Cortex-A510: 2077057: workaround software-step corrupting SPSR_EL2"
++      help
++        This option adds the workaround for ARM Cortex-A510 erratum 2077057.
++        Affected Cortex-A510 may corrupt SPSR_EL2 when the a step exception is
++        expected, but a Pointer Authentication trap is taken instead. The
++        erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow
++        EL1 to cause a return to EL2 with a guest controlled ELR_EL2.
++
++        This can only happen when EL2 is stepping EL1.
++
++        When these conditions occur, the SPSR_EL2 value is unchanged from the
++        previous guest entry, and can be restored from the in-memory copy.
++
++        If unsure, say Y.
++
+ config ARM64_ERRATUM_2119858
+       bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode"
+       default y
+diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
+index 066098198c248..b217941713a8d 100644
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -600,6 +600,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
+               CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus),
+       },
+ #endif
++#ifdef CONFIG_ARM64_ERRATUM_2077057
++      {
++              .desc = "ARM erratum 2077057",
++              .capability = ARM64_WORKAROUND_2077057,
++              .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
++              ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2),
++      },
++#endif
+ #ifdef CONFIG_ARM64_ERRATUM_2064142
+       {
+               .desc = "ARM erratum 2064142",
+diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
+index adb67f8c9d7d3..3ae9c0b944878 100644
+--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
++++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
+@@ -424,6 +424,24 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+       return false;
+ }
++static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code)
++{
++      /*
++       * Check for the conditions of Cortex-A510's #2077057. When these occur
++       * SPSR_EL2 can't be trusted, but isn't needed either as it is
++       * unchanged from the value in vcpu_gp_regs(vcpu)->pstate.
++       * Are we single-stepping the guest, and took a PAC exception from the
++       * active-not-pending state?
++       */
++      if (cpus_have_final_cap(ARM64_WORKAROUND_2077057)               &&
++          vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP                 &&
++          *vcpu_cpsr(vcpu) & DBG_SPSR_SS                              &&
++          ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC)
++              write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
++
++      vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
++}
++
+ /*
+  * Return true when we were able to fixup the guest exit and should return to
+  * the guest, false when we should restore the host state and return to the
+@@ -435,7 +453,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
+        * Save PSTATE early so that we can evaluate the vcpu mode
+        * early on.
+        */
+-      vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
++      synchronize_vcpu_pstate(vcpu, exit_code);
+       /*
+        * Check whether we want to repaint the state one way or
+diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
+index e7719e8f18def..9c65b1e25a965 100644
+--- a/arch/arm64/tools/cpucaps
++++ b/arch/arm64/tools/cpucaps
+@@ -55,9 +55,10 @@ WORKAROUND_1418040
+ WORKAROUND_1463225
+ WORKAROUND_1508412
+ WORKAROUND_1542419
+-WORKAROUND_2064142
+-WORKAROUND_2038923
+ WORKAROUND_1902691
++WORKAROUND_2038923
++WORKAROUND_2064142
++WORKAROUND_2077057
+ WORKAROUND_TRBE_OVERWRITE_FILL_MODE
+ WORKAROUND_TSB_FLUSH_FAILURE
+ WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
+-- 
+2.34.1
+
diff --git a/queue-5.16/kvm-x86-add-kvm_cap_enable_cap-to-x86.patch b/queue-5.16/kvm-x86-add-kvm_cap_enable_cap-to-x86.patch
new file mode 100644 (file)
index 0000000..f66c792
--- /dev/null
@@ -0,0 +1,53 @@
+From 768f4b6c9d9178cefd0355fde27e1d8aa3d3f89a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 14 Feb 2022 21:29:51 +0000
+Subject: KVM: x86: Add KVM_CAP_ENABLE_CAP to x86
+
+From: Aaron Lewis <aaronlewis@google.com>
+
+[ Upstream commit 127770ac0d043435375ab86434f31a93efa88215 ]
+
+Follow the precedent set by other architectures that support the VCPU
+ioctl, KVM_ENABLE_CAP, and advertise the VM extension, KVM_CAP_ENABLE_CAP.
+This way, userspace can ensure that KVM_ENABLE_CAP is available on a
+vcpu before using it.
+
+Fixes: 5c919412fe61 ("kvm/x86: Hyper-V synthetic interrupt controller")
+Signed-off-by: Aaron Lewis <aaronlewis@google.com>
+Message-Id: <20220214212950.1776943-1-aaronlewis@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/virt/kvm/api.rst | 2 +-
+ arch/x86/kvm/x86.c             | 1 +
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
+index aeeb071c76881..9df9eadaeb5c2 100644
+--- a/Documentation/virt/kvm/api.rst
++++ b/Documentation/virt/kvm/api.rst
+@@ -1391,7 +1391,7 @@ documentation when it pops into existence).
+ -------------------
+ :Capability: KVM_CAP_ENABLE_CAP
+-:Architectures: mips, ppc, s390
++:Architectures: mips, ppc, s390, x86
+ :Type: vcpu ioctl
+ :Parameters: struct kvm_enable_cap (in)
+ :Returns: 0 on success; -1 on error
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 0714fa0e7ede0..c6eb3e45e3d80 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -4163,6 +4163,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+       case KVM_CAP_SREGS2:
+       case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+       case KVM_CAP_VCPU_ATTRIBUTES:
++      case KVM_CAP_ENABLE_CAP:
+               r = 1;
+               break;
+       case KVM_CAP_EXIT_HYPERCALL:
+-- 
+2.34.1
+
diff --git a/queue-5.16/sched-fair-fix-fault-in-reweight_entity.patch b/queue-5.16/sched-fair-fix-fault-in-reweight_entity.patch
new file mode 100644 (file)
index 0000000..9dde504
--- /dev/null
@@ -0,0 +1,105 @@
+From 9f407ba6c655408e85afec84fdd872507b6a2954 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Feb 2022 08:18:46 -0800
+Subject: sched/fair: Fix fault in reweight_entity
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tadeusz Struk <tadeusz.struk@linaro.org>
+
+[ Upstream commit 13765de8148f71fa795e0a6607de37c49ea5915a ]
+
+Syzbot found a GPF in reweight_entity. This has been bisected to
+commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid
+sched_task_group")
+
+There is a race between sched_post_fork() and setpriority(PRIO_PGRP)
+within a thread group that causes a null-ptr-deref in
+reweight_entity() in CFS. The scenario is that the main process spawns
+number of new threads, which then call setpriority(PRIO_PGRP, 0, -20),
+wait, and exit.  For each of the new threads the copy_process() gets
+invoked, which adds the new task_struct and calls sched_post_fork()
+for it.
+
+In the above scenario there is a possibility that
+setpriority(PRIO_PGRP) and set_one_prio() will be called for a thread
+in the group that is just being created by copy_process(), and for
+which the sched_post_fork() has not been executed yet. This will
+trigger a null pointer dereference in reweight_entity(), as it will
+try to access the run queue pointer, which hasn't been set.
+
+Before the mentioned change the cfs_rq pointer for the task  has been
+set in sched_fork(), which is called much earlier in copy_process(),
+before the new task is added to the thread_group.  Now it is done in
+the sched_post_fork(), which is called after that.  To fix the issue,
+remove the update_load param from the set_load_weight() function
+and call reweight_task() only if the task state doesn't have the
+TASK_NEW flag set.
+
+Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
+Reported-by: syzbot+af7a719bc92395ee41b3@syzkaller.appspotmail.com
+Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20220203161846.1160750-1-tadeusz.struk@linaro.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/core.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index d24823b3c3f9f..35b256b789680 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1203,8 +1203,9 @@ int tg_nop(struct task_group *tg, void *data)
+ }
+ #endif
+-static void set_load_weight(struct task_struct *p, bool update_load)
++static void set_load_weight(struct task_struct *p)
+ {
++      bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
+       int prio = p->static_prio - MAX_RT_PRIO;
+       struct load_weight *load = &p->se.load;
+@@ -4392,7 +4393,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
+                       p->static_prio = NICE_TO_PRIO(0);
+               p->prio = p->normal_prio = p->static_prio;
+-              set_load_weight(p, false);
++              set_load_weight(p);
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+@@ -6879,7 +6880,7 @@ void set_user_nice(struct task_struct *p, long nice)
+               put_prev_task(rq, p);
+       p->static_prio = NICE_TO_PRIO(nice);
+-      set_load_weight(p, true);
++      set_load_weight(p);
+       old_prio = p->prio;
+       p->prio = effective_prio(p);
+@@ -7170,7 +7171,7 @@ static void __setscheduler_params(struct task_struct *p,
+        */
+       p->rt_priority = attr->sched_priority;
+       p->normal_prio = normal_prio(p);
+-      set_load_weight(p, true);
++      set_load_weight(p);
+ }
+ /*
+@@ -9409,7 +9410,7 @@ void __init sched_init(void)
+ #endif
+       }
+-      set_load_weight(&init_task, false);
++      set_load_weight(&init_task);
+       /*
+        * The boot idle thread does lazy MMU switching as well:
+-- 
+2.34.1
+
diff --git a/queue-5.16/serial-stm32-prevent-tdr-register-overwrite-when-sen.patch b/queue-5.16/serial-stm32-prevent-tdr-register-overwrite-when-sen.patch
new file mode 100644 (file)
index 0000000..0b5fba4
--- /dev/null
@@ -0,0 +1,57 @@
+From 36807348826799579605d1c23d7af6b7b253d0fa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 11 Jan 2022 17:44:40 +0100
+Subject: serial: stm32: prevent TDR register overwrite when sending x_char
+
+From: Valentin Caron <valentin.caron@foss.st.com>
+
+[ Upstream commit d3d079bde07e1b7deaeb57506dc0b86010121d17 ]
+
+When sending x_char in stm32_usart_transmit_chars(), driver can overwrite
+the value of TDR register by the value of x_char. If this happens, the
+previous value that was present in TDR register will not be sent through
+uart.
+
+This code checks if the previous value in TDR register is sent before
+writing the x_char value into register.
+
+Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver")
+Cc: stable <stable@vger.kernel.org>
+Signed-off-by: Valentin Caron <valentin.caron@foss.st.com>
+Link: https://lore.kernel.org/r/20220111164441.6178-2-valentin.caron@foss.st.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/tty/serial/stm32-usart.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
+index 2d3fbcbfaf108..93c2a5c956540 100644
+--- a/drivers/tty/serial/stm32-usart.c
++++ b/drivers/tty/serial/stm32-usart.c
+@@ -520,10 +520,22 @@ static void stm32_usart_transmit_chars(struct uart_port *port)
+       struct stm32_port *stm32_port = to_stm32_port(port);
+       const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs;
+       struct circ_buf *xmit = &port->state->xmit;
++      u32 isr;
++      int ret;
+       if (port->x_char) {
+               if (stm32_port->tx_dma_busy)
+                       stm32_usart_clr_bits(port, ofs->cr3, USART_CR3_DMAT);
++
++              /* Check that TDR is empty before filling FIFO */
++              ret =
++              readl_relaxed_poll_timeout_atomic(port->membase + ofs->isr,
++                                                isr,
++                                                (isr & USART_SR_TXE),
++                                                10, 1000);
++              if (ret)
++                      dev_warn(port->dev, "1 character may be erased\n");
++
+               writel_relaxed(port->x_char, port->membase + ofs->tdr);
+               port->x_char = 0;
+               port->icount.tx++;
+-- 
+2.34.1
+
index efec9e75ddfece9d20c8227c41db77843bf5fa14..c80383fe37acf100733e7ead3aa4d449deb9ce0a 100644 (file)
@@ -27,3 +27,19 @@ net-usb-cdc_mbim-avoid-altsetting-toggling-for-telit.patch
 block-map-add-__gfp_zero-flag-for-alloc_page-in-func.patch
 usb-gadget-don-t-release-an-existing-dev-buf.patch
 usb-gadget-clear-related-members-when-goto-fail.patch
+exfat-reuse-exfat_inode_info-variable-instead-of-cal.patch
+exfat-fix-i_blocks-for-files-truncated-over-4-gib.patch
+tracing-add-test-for-user-space-strings-when-filteri.patch
+arm64-mark-start_backtrace-notrace-and-nokprobe_symb.patch
+serial-stm32-prevent-tdr-register-overwrite-when-sen.patch
+kvm-arm64-workaround-cortex-a510-s-single-step-and-p.patch
+ext4-drop-ineligible-txn-start-stop-apis.patch
+ext4-simplify-updating-of-fast-commit-stats.patch
+ext4-fast-commit-may-not-fallback-for-ineligible-com.patch
+ext4-fast-commit-may-miss-file-actions.patch
+sched-fair-fix-fault-in-reweight_entity.patch
+kvm-x86-add-kvm_cap_enable_cap-to-x86.patch
+ata-pata_hpt37x-fix-pci-clock-detection.patch
+drm-amdgpu-check-vm-ready-by-amdgpu_vm-evicting-flag.patch
+tracing-add-ustring-operation-to-filtering-string-po.patch
+ipv6-fix-skb-drops-in-igmp6_event_query-and-igmp6_ev.patch
diff --git a/queue-5.16/tracing-add-test-for-user-space-strings-when-filteri.patch b/queue-5.16/tracing-add-test-for-user-space-strings-when-filteri.patch
new file mode 100644 (file)
index 0000000..8b45569
--- /dev/null
@@ -0,0 +1,217 @@
+From d003cfd3df7bf6ed1560ad3fe2abd2102c0b564e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Jan 2022 11:55:32 -0500
+Subject: tracing: Add test for user space strings when filtering on string
+ pointers
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+[ Upstream commit 77360f9bbc7e5e2ab7a2c8b4c0244fbbfcfc6f62 ]
+
+Pingfan reported that the following causes a fault:
+
+  echo "filename ~ \"cpu\"" > events/syscalls/sys_enter_openat/filter
+  echo 1 > events/syscalls/sys_enter_openat/enable
+
+The reason is that trace event filter treats the user space pointer
+defined by "filename" as a normal pointer to compare against the "cpu"
+string. The following bug happened:
+
+ kvm-03-guest16 login: [72198.026181] BUG: unable to handle page fault for address: 00007fffaae8ef60
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0001) - permissions violation
+ PGD 80000001008b7067 P4D 80000001008b7067 PUD 2393f1067 PMD 2393ec067 PTE 8000000108f47867
+ Oops: 0001 [#1] PREEMPT SMP PTI
+ CPU: 1 PID: 1 Comm: systemd Kdump: loaded Not tainted 5.14.0-32.el9.x86_64 #1
+ Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+ RIP: 0010:strlen+0x0/0x20
+ Code: 48 89 f9 74 09 48 83 c1 01 80 39 00 75 f7 31 d2 44 0f b6 04 16 44 88 04 11
+       48 83 c2 01 45 84 c0 75 ee c3 0f 1f 80 00 00 00 00 <80> 3f 00 74 10 48 89 f8
+       48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31
+ RSP: 0018:ffffb5b900013e48 EFLAGS: 00010246
+ RAX: 0000000000000018 RBX: ffff8fc1c49ede00 RCX: 0000000000000000
+ RDX: 0000000000000020 RSI: ffff8fc1c02d601c RDI: 00007fffaae8ef60
+ RBP: 00007fffaae8ef60 R08: 0005034f4ddb8ea4 R09: 0000000000000000
+ R10: ffff8fc1c02d601c R11: 0000000000000000 R12: ffff8fc1c8a6e380
+ R13: 0000000000000000 R14: ffff8fc1c02d6010 R15: ffff8fc1c00453c0
+ FS:  00007fa86123db40(0000) GS:ffff8fc2ffd00000(0000) knlGS:0000000000000000
+ CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007fffaae8ef60 CR3: 0000000102880001 CR4: 00000000007706e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ PKRU: 55555554
+ Call Trace:
+  filter_pred_pchar+0x18/0x40
+  filter_match_preds+0x31/0x70
+  ftrace_syscall_enter+0x27a/0x2c0
+  syscall_trace_enter.constprop.0+0x1aa/0x1d0
+  do_syscall_64+0x16/0x90
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7fa861d88664
+
+The above happened because the kernel tried to access user space directly
+and triggered a "supervisor read access in kernel mode" fault. Worse yet,
+the memory could not even be loaded yet, and a SEGFAULT could happen as
+well. This could be true for kernel space accessing as well.
+
+To be even more robust, test both kernel and user space strings. If the
+string fails to read, then simply have the filter fail.
+
+Note, TASK_SIZE is used to determine if the pointer is user or kernel space
+and the appropriate strncpy_from_kernel/user_nofault() function is used to
+copy the memory. For some architectures, the compare to TASK_SIZE may always
+pick user space or kernel space. If it gets it wrong, the only thing is that
+the filter will fail to match. In the future, this needs to be fixed to have
+the event denote which should be used. But failing a filter is much better
+than panicking the machine, and that can be solved later.
+
+Link: https://lore.kernel.org/all/20220107044951.22080-1-kernelfans@gmail.com/
+Link: https://lkml.kernel.org/r/20220110115532.536088fd@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Tom Zanussi <zanussi@kernel.org>
+Reported-by: Pingfan Liu <kernelfans@gmail.com>
+Tested-by: Pingfan Liu <kernelfans@gmail.com>
+Fixes: 87a342f5db69d ("tracing/filters: Support filtering for char * strings")
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/trace/events.rst     | 10 +++++
+ kernel/trace/trace_events_filter.c | 66 ++++++++++++++++++++++++++++--
+ 2 files changed, 73 insertions(+), 3 deletions(-)
+
+diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
+index 8ddb9b09451c8..45e66a60a816a 100644
+--- a/Documentation/trace/events.rst
++++ b/Documentation/trace/events.rst
+@@ -230,6 +230,16 @@ Currently the caret ('^') for an error always appears at the beginning of
+ the filter string; the error message should still be useful though
+ even without more accurate position info.
++5.2.1 Filter limitations
++------------------------
++
++If a filter is placed on a string pointer ``(char *)`` that does not point
++to a string on the ring buffer, but instead points to kernel or user space
++memory, then, for safety reasons, at most 1024 bytes of the content is
++copied onto a temporary buffer to do the compare. If the copy of the memory
++faults (the pointer points to memory that should not be accessed), then the
++string compare will be treated as not matching.
++
+ 5.3 Clearing filters
+ --------------------
+diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
+index c9124038b140f..d3eb3c630f601 100644
+--- a/kernel/trace/trace_events_filter.c
++++ b/kernel/trace/trace_events_filter.c
+@@ -5,6 +5,7 @@
+  * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+  */
++#include <linux/uaccess.h>
+ #include <linux/module.h>
+ #include <linux/ctype.h>
+ #include <linux/mutex.h>
+@@ -654,6 +655,47 @@ DEFINE_EQUALITY_PRED(32);
+ DEFINE_EQUALITY_PRED(16);
+ DEFINE_EQUALITY_PRED(8);
++/* user space strings temp buffer */
++#define USTRING_BUF_SIZE      1024
++
++struct ustring_buffer {
++      char            buffer[USTRING_BUF_SIZE];
++};
++
++static __percpu struct ustring_buffer *ustring_per_cpu;
++
++static __always_inline char *test_string(char *str)
++{
++      struct ustring_buffer *ubuf;
++      char __user *ustr;
++      char *kstr;
++
++      if (!ustring_per_cpu)
++              return NULL;
++
++      ubuf = this_cpu_ptr(ustring_per_cpu);
++      kstr = ubuf->buffer;
++
++      /*
++       * We use TASK_SIZE to denote user or kernel space, but this will
++       * not work for all architectures. If it picks the wrong one, it may
++       * just fail the filter (but will not bug).
++       *
++       * TODO: Have a way to properly denote which one this is for.
++       */
++      if (likely((unsigned long)str >= TASK_SIZE)) {
++              /* For safety, do not trust the string pointer */
++              if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
++                      return NULL;
++      } else {
++              /* user space address? */
++              ustr = (char __user *)str;
++              if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
++                      return NULL;
++      }
++      return kstr;
++}
++
+ /* Filter predicate for fixed sized arrays of characters */
+ static int filter_pred_string(struct filter_pred *pred, void *event)
+ {
+@@ -671,10 +713,16 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
+ static int filter_pred_pchar(struct filter_pred *pred, void *event)
+ {
+       char **addr = (char **)(event + pred->offset);
++      char *str;
+       int cmp, match;
+-      int len = strlen(*addr) + 1;    /* including tailing '\0' */
++      int len;
+-      cmp = pred->regex.match(*addr, &pred->regex, len);
++      str = test_string(*addr);
++      if (!str)
++              return 0;
++
++      len = strlen(str) + 1;  /* including tailing '\0' */
++      cmp = pred->regex.match(str, &pred->regex, len);
+       match = cmp ^ pred->not;
+@@ -1320,8 +1368,17 @@ static int parse_pred(const char *str, void *data,
+               } else if (field->filter_type == FILTER_DYN_STRING)
+                       pred->fn = filter_pred_strloc;
+-              else
++              else {
++
++                      if (!ustring_per_cpu) {
++                              /* Once allocated, keep it around for good */
++                              ustring_per_cpu = alloc_percpu(struct ustring_buffer);
++                              if (!ustring_per_cpu)
++                                      goto err_mem;
++                      }
++
+                       pred->fn = filter_pred_pchar;
++              }
+               /* go past the last quote */
+               i++;
+@@ -1387,6 +1444,9 @@ static int parse_pred(const char *str, void *data,
+ err_free:
+       kfree(pred);
+       return -EINVAL;
++err_mem:
++      kfree(pred);
++      return -ENOMEM;
+ }
+ enum {
+-- 
+2.34.1
+
diff --git a/queue-5.16/tracing-add-ustring-operation-to-filtering-string-po.patch b/queue-5.16/tracing-add-ustring-operation-to-filtering-string-po.patch
new file mode 100644 (file)
index 0000000..11f23d1
--- /dev/null
@@ -0,0 +1,190 @@
+From 641b43ce7c85d47bfb4887dc81217b3334be4593 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 13 Jan 2022 20:08:40 -0500
+Subject: tracing: Add ustring operation to filtering string pointers
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+[ Upstream commit f37c3bbc635994eda203a6da4ba0f9d05165a8d6 ]
+
+Since referencing user space pointers is special, if the user wants to
+filter on a field that is a pointer to user space, then they need to
+specify it.
+
+Add a ".ustring" attribute to the field name for filters to state that the
+field is pointing to user space such that the kernel can take the
+appropriate action to read that pointer.
+
+Link: https://lore.kernel.org/all/yt9d8rvmt2jq.fsf@linux.ibm.com/
+
+Fixes: 77360f9bbc7e ("tracing: Add test for user space strings when filtering on string pointers")
+Tested-by: Sven Schnelle <svens@linux.ibm.com>
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ Documentation/trace/events.rst     |  9 ++++
+ kernel/trace/trace_events_filter.c | 81 +++++++++++++++++++++---------
+ 2 files changed, 66 insertions(+), 24 deletions(-)
+
+diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
+index 45e66a60a816a..c47f381d0c002 100644
+--- a/Documentation/trace/events.rst
++++ b/Documentation/trace/events.rst
+@@ -198,6 +198,15 @@ The glob (~) accepts a wild card character (\*,?) and character classes
+   prev_comm ~ "*sh*"
+   prev_comm ~ "ba*sh"
++If the field is a pointer that points into user space (for example
++"filename" from sys_enter_openat), then you have to append ".ustring" to the
++field name::
++
++  filename.ustring ~ "password"
++
++As the kernel will have to know how to retrieve the memory that the pointer
++is at from user space.
++
+ 5.2 Setting filters
+ -------------------
+diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
+index d3eb3c630f601..06d6318ee5377 100644
+--- a/kernel/trace/trace_events_filter.c
++++ b/kernel/trace/trace_events_filter.c
+@@ -665,6 +665,23 @@ struct ustring_buffer {
+ static __percpu struct ustring_buffer *ustring_per_cpu;
+ static __always_inline char *test_string(char *str)
++{
++      struct ustring_buffer *ubuf;
++      char *kstr;
++
++      if (!ustring_per_cpu)
++              return NULL;
++
++      ubuf = this_cpu_ptr(ustring_per_cpu);
++      kstr = ubuf->buffer;
++
++      /* For safety, do not trust the string pointer */
++      if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
++              return NULL;
++      return kstr;
++}
++
++static __always_inline char *test_ustring(char *str)
+ {
+       struct ustring_buffer *ubuf;
+       char __user *ustr;
+@@ -676,23 +693,11 @@ static __always_inline char *test_string(char *str)
+       ubuf = this_cpu_ptr(ustring_per_cpu);
+       kstr = ubuf->buffer;
+-      /*
+-       * We use TASK_SIZE to denote user or kernel space, but this will
+-       * not work for all architectures. If it picks the wrong one, it may
+-       * just fail the filter (but will not bug).
+-       *
+-       * TODO: Have a way to properly denote which one this is for.
+-       */
+-      if (likely((unsigned long)str >= TASK_SIZE)) {
+-              /* For safety, do not trust the string pointer */
+-              if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+-                      return NULL;
+-      } else {
+-              /* user space address? */
+-              ustr = (char __user *)str;
+-              if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+-                      return NULL;
+-      }
++      /* user space address? */
++      ustr = (char __user *)str;
++      if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
++              return NULL;
++
+       return kstr;
+ }
+@@ -709,24 +714,42 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
+       return match;
+ }
++static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
++{
++      int cmp, match;
++      int len;
++
++      len = strlen(str) + 1;  /* including tailing '\0' */
++      cmp = pred->regex.match(str, &pred->regex, len);
++
++      match = cmp ^ pred->not;
++
++      return match;
++}
+ /* Filter predicate for char * pointers */
+ static int filter_pred_pchar(struct filter_pred *pred, void *event)
+ {
+       char **addr = (char **)(event + pred->offset);
+       char *str;
+-      int cmp, match;
+-      int len;
+       str = test_string(*addr);
+       if (!str)
+               return 0;
+-      len = strlen(str) + 1;  /* including tailing '\0' */
+-      cmp = pred->regex.match(str, &pred->regex, len);
++      return filter_pchar(pred, str);
++}
+-      match = cmp ^ pred->not;
++/* Filter predicate for char * pointers in user space*/
++static int filter_pred_pchar_user(struct filter_pred *pred, void *event)
++{
++      char **addr = (char **)(event + pred->offset);
++      char *str;
+-      return match;
++      str = test_ustring(*addr);
++      if (!str)
++              return 0;
++
++      return filter_pchar(pred, str);
+ }
+ /*
+@@ -1206,6 +1229,7 @@ static int parse_pred(const char *str, void *data,
+       struct filter_pred *pred = NULL;
+       char num_buf[24];       /* Big enough to hold an address */
+       char *field_name;
++      bool ustring = false;
+       char q;
+       u64 val;
+       int len;
+@@ -1240,6 +1264,12 @@ static int parse_pred(const char *str, void *data,
+               return -EINVAL;
+       }
++      /* See if the field is a user space string */
++      if ((len = str_has_prefix(str + i, ".ustring"))) {
++              ustring = true;
++              i += len;
++      }
++
+       while (isspace(str[i]))
+               i++;
+@@ -1377,7 +1407,10 @@ static int parse_pred(const char *str, void *data,
+                                       goto err_mem;
+                       }
+-                      pred->fn = filter_pred_pchar;
++                      if (ustring)
++                              pred->fn = filter_pred_pchar_user;
++                      else
++                              pred->fn = filter_pred_pchar;
+               }
+               /* go past the last quote */
+               i++;
+-- 
+2.34.1
+