]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Jul 2021 08:53:54 +0000 (10:53 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Jul 2021 08:53:54 +0000 (10:53 +0200)
added patches:
btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
tracing-histogram-rename-cpu-to-common_cpu.patch
userfaultfd-do-not-untag-user-pointers.patch

queue-5.4/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch [new file with mode: 0644]
queue-5.4/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch [new file with mode: 0644]
queue-5.4/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch [new file with mode: 0644]
queue-5.4/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch [new file with mode: 0644]
queue-5.4/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch [new file with mode: 0644]
queue-5.4/tracing-histogram-rename-cpu-to-common_cpu.patch [new file with mode: 0644]
queue-5.4/userfaultfd-do-not-untag-user-pointers.patch [new file with mode: 0644]

diff --git a/queue-5.4/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch b/queue-5.4/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
new file mode 100644 (file)
index 0000000..c4b72e8
--- /dev/null
@@ -0,0 +1,80 @@
+From 16a200f66ede3f9afa2e51d90ade017aaa18d213 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Sun, 4 Jul 2021 19:14:39 +0800
+Subject: btrfs: check for missing device in btrfs_trim_fs
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 16a200f66ede3f9afa2e51d90ade017aaa18d213 upstream.
+
+A fstrim on a degraded raid1 can trigger the following null pointer
+dereference:
+
+  BTRFS info (device loop0): allowing degraded mounts
+  BTRFS info (device loop0): disk space caching is enabled
+  BTRFS info (device loop0): has skinny extents
+  BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+  BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+  BTRFS info (device loop0): enabling ssd optimizations
+  BUG: kernel NULL pointer dereference, address: 0000000000000620
+  PGD 0 P4D 0
+  Oops: 0000 [#1] SMP NOPTI
+  CPU: 0 PID: 4574 Comm: fstrim Not tainted 5.13.0-rc7+ #31
+  Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+  RIP: 0010:btrfs_trim_fs+0x199/0x4a0 [btrfs]
+  RSP: 0018:ffff959541797d28 EFLAGS: 00010293
+  RAX: 0000000000000000 RBX: ffff946f84eca508 RCX: a7a67937adff8608
+  RDX: ffff946e8122d000 RSI: 0000000000000000 RDI: ffffffffc02fdbf0
+  RBP: ffff946ea4615000 R08: 0000000000000001 R09: 0000000000000000
+  R10: 0000000000000000 R11: ffff946e8122d960 R12: 0000000000000000
+  R13: ffff959541797db8 R14: ffff946e8122d000 R15: ffff959541797db8
+  FS:  00007f55917a5080(0000) GS:ffff946f9bc00000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 0000000000000620 CR3: 000000002d2c8001 CR4: 00000000000706f0
+  Call Trace:
+  btrfs_ioctl_fitrim+0x167/0x260 [btrfs]
+  btrfs_ioctl+0x1c00/0x2fe0 [btrfs]
+  ? selinux_file_ioctl+0x140/0x240
+  ? syscall_trace_enter.constprop.0+0x188/0x240
+  ? __x64_sys_ioctl+0x83/0xb0
+  __x64_sys_ioctl+0x83/0xb0
+
+Reproducer:
+
+  $ mkfs.btrfs -fq -d raid1 -m raid1 /dev/loop0 /dev/loop1
+  $ mount /dev/loop0 /btrfs
+  $ umount /btrfs
+  $ btrfs dev scan --forget
+  $ mount -o degraded /dev/loop0 /btrfs
+
+  $ fstrim /btrfs
+
+The reason is we call btrfs_trim_free_extents() for the missing device,
+which uses device->bdev (NULL for missing device) to find if the device
+supports discard.
+
+Fix is to check if the device is missing before calling
+btrfs_trim_free_extents().
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5768,6 +5768,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
++              if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
++                      continue;
++
+               ret = btrfs_trim_free_extents(device, &group_trimmed);
+               if (ret) {
+                       dev_failed++;
diff --git a/queue-5.4/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch b/queue-5.4/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
new file mode 100644 (file)
index 0000000..debaa68
--- /dev/null
@@ -0,0 +1,66 @@
+From 2bab693a608bdf614b9fcd44083c5100f34b9f77 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Tue, 13 Jul 2021 19:43:26 +0100
+Subject: firmware/efi: Tell memblock about EFI iomem reservations
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 2bab693a608bdf614b9fcd44083c5100f34b9f77 upstream.
+
+kexec_load_file() relies on the memblock infrastructure to avoid
+stamping over regions of memory that are essential to the survival
+of the system.
+
+However, nobody seems to agree how to flag these regions as reserved,
+and (for example) EFI only publishes its reservations in /proc/iomem
+for the benefit of the traditional, userspace based kexec tool.
+
+On arm64 platforms with GICv3, this can result in the payload being
+placed at the location of the LPI tables. Shock, horror!
+
+Let's augment the EFI reservation code with a memblock_reserve() call,
+protecting our dear tables from the secondary kernel invasion.
+
+Reported-by: Moritz Fischer <mdf@kernel.org>
+Tested-by: Moritz Fischer <mdf@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: James Morse <james.morse@arm.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/efi.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/drivers/firmware/efi/efi.c
++++ b/drivers/firmware/efi/efi.c
+@@ -975,6 +975,7 @@ static int __init efi_memreserve_map_roo
+ static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size)
+ {
+       struct resource *res, *parent;
++      int ret;
+       res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+       if (!res)
+@@ -987,7 +988,17 @@ static int efi_mem_reserve_iomem(phys_ad
+       /* we expect a conflict with a 'System RAM' region */
+       parent = request_resource_conflict(&iomem_resource, res);
+-      return parent ? request_resource(parent, res) : 0;
++      ret = parent ? request_resource(parent, res) : 0;
++
++      /*
++       * Given that efi_mem_reserve_iomem() can be called at any
++       * time, only call memblock_reserve() if the architecture
++       * keeps the infrastructure around.
++       */
++      if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret)
++              memblock_reserve(addr, size);
++
++      return ret;
+ }
+ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
diff --git a/queue-5.4/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch b/queue-5.4/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
new file mode 100644 (file)
index 0000000..7734bb9
--- /dev/null
@@ -0,0 +1,55 @@
+From 09cfae9f13d51700b0fecf591dcd658fc5375428 Mon Sep 17 00:00:00 2001
+From: Markus Boehme <markubo@amazon.com>
+Date: Tue, 20 Jul 2021 16:26:19 -0700
+Subject: ixgbe: Fix packet corruption due to missing DMA sync
+
+From: Markus Boehme <markubo@amazon.com>
+
+commit 09cfae9f13d51700b0fecf591dcd658fc5375428 upstream.
+
+When receiving a packet with multiple fragments, hardware may still
+touch the first fragment until the entire packet has been received. The
+driver therefore keeps the first fragment mapped for DMA until end of
+packet has been asserted, and delays its dma_sync call until then.
+
+The driver tries to fit multiple receive buffers on one page. When using
+3K receive buffers (e.g. using Jumbo frames and legacy-rx is turned
+off/build_skb is being used) on an architecture with 4K pages, the
+driver allocates an order 1 compound page and uses one page per receive
+buffer. To determine the correct offset for a delayed DMA sync of the
+first fragment of a multi-fragment packet, the driver then cannot just
+use PAGE_MASK on the DMA address but has to construct a mask based on
+the actual size of the backing page.
+
+Using PAGE_MASK in the 3K RX buffer/4K page architecture configuration
+will always sync the first page of a compound page. With the SWIOTLB
+enabled this can lead to corrupted packets (zeroed out first fragment,
+re-used garbage from another packet) and various consequences, such as
+slow/stalling data transfers and connection resets. For example, testing
+on a link with MTU exceeding 3058 bytes on a host with SWIOTLB enabled
+(e.g. "iommu=soft swiotlb=262144,force") TCP transfers quickly fizzle
+out without this patch.
+
+Cc: stable@vger.kernel.org
+Fixes: 0c5661ecc5dd7 ("ixgbe: fix crash in build_skb Rx code path")
+Signed-off-by: Markus Boehme <markubo@amazon.com>
+Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+@@ -1827,7 +1827,8 @@ static void ixgbe_dma_sync_frag(struct i
+                               struct sk_buff *skb)
+ {
+       if (ring_uses_build_skb(rx_ring)) {
+-              unsigned long offset = (unsigned long)(skb->data) & ~PAGE_MASK;
++              unsigned long mask = (unsigned long)ixgbe_rx_pg_size(rx_ring) - 1;
++              unsigned long offset = (unsigned long)(skb->data) & mask;
+               dma_sync_single_range_for_cpu(rx_ring->dev,
+                                             IXGBE_CB(skb)->dma,
diff --git a/queue-5.4/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch b/queue-5.4/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
new file mode 100644 (file)
index 0000000..d233006
--- /dev/null
@@ -0,0 +1,82 @@
+From 8d4abca95ecc82fc8c41912fa0085281f19cc29f Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
+Date: Mon, 19 Apr 2021 18:43:32 -0500
+Subject: media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf()
+
+From: Gustavo A. R. Silva <gustavoars@kernel.org>
+
+commit 8d4abca95ecc82fc8c41912fa0085281f19cc29f upstream.
+
+Fix an 11-year old bug in ngene_command_config_free_buf() while
+addressing the following warnings caught with -Warray-bounds:
+
+arch/alpha/include/asm/string.h:22:16: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+arch/x86/include/asm/string_32.h:182:25: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+
+The problem is that the original code is trying to copy 6 bytes of
+data into a one-byte size member _config_ of the wrong structure
+FW_CONFIGURE_BUFFERS, in a single call to memcpy(). This causes a
+legitimate compiler warning because memcpy() overruns the length
+of &com.cmd.ConfigureBuffers.config. It seems that the right
+structure is FW_CONFIGURE_FREE_BUFFERS, instead, because it contains
+6 more members apart from the header _hdr_. Also, the name of
+the function ngene_command_config_free_buf() suggests that the actual
+intention is to ConfigureFreeBuffers, instead of ConfigureBuffers
+(which takes place in the function ngene_command_config_buf(), above).
+
+Fix this by enclosing those 6 members of struct FW_CONFIGURE_FREE_BUFFERS
+into new struct config, and use &com.cmd.ConfigureFreeBuffers.config as
+the destination address, instead of &com.cmd.ConfigureBuffers.config,
+when calling memcpy().
+
+This also helps with the ongoing efforts to globally enable
+-Warray-bounds and get us closer to being able to tighten the
+FORTIFY_SOURCE routines on memcpy().
+
+Link: https://github.com/KSPP/linux/issues/109
+Fixes: dae52d009fc9 ("V4L/DVB: ngene: Initial check-in")
+Cc: stable@vger.kernel.org
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
+Link: https://lore.kernel.org/linux-hardening/20210420001631.GA45456@embeddedor/
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/pci/ngene/ngene-core.c |    2 +-
+ drivers/media/pci/ngene/ngene.h      |   14 ++++++++------
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/media/pci/ngene/ngene-core.c
++++ b/drivers/media/pci/ngene/ngene-core.c
+@@ -385,7 +385,7 @@ static int ngene_command_config_free_buf
+       com.cmd.hdr.Opcode = CMD_CONFIGURE_FREE_BUFFER;
+       com.cmd.hdr.Length = 6;
+-      memcpy(&com.cmd.ConfigureBuffers.config, config, 6);
++      memcpy(&com.cmd.ConfigureFreeBuffers.config, config, 6);
+       com.in_len = 6;
+       com.out_len = 0;
+--- a/drivers/media/pci/ngene/ngene.h
++++ b/drivers/media/pci/ngene/ngene.h
+@@ -407,12 +407,14 @@ enum _BUFFER_CONFIGS {
+ struct FW_CONFIGURE_FREE_BUFFERS {
+       struct FW_HEADER hdr;
+-      u8   UVI1_BufferLength;
+-      u8   UVI2_BufferLength;
+-      u8   TVO_BufferLength;
+-      u8   AUD1_BufferLength;
+-      u8   AUD2_BufferLength;
+-      u8   TVA_BufferLength;
++      struct {
++              u8   UVI1_BufferLength;
++              u8   UVI2_BufferLength;
++              u8   TVO_BufferLength;
++              u8   AUD1_BufferLength;
++              u8   AUD2_BufferLength;
++              u8   TVA_BufferLength;
++      } __packed config;
+ } __attribute__ ((__packed__));
+ struct FW_CONFIGURE_UART {
diff --git a/queue-5.4/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch b/queue-5.4/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
new file mode 100644 (file)
index 0000000..9c441db
--- /dev/null
@@ -0,0 +1,56 @@
+From 0db282ba2c12c1515d490d14a1ff696643ab0f1b Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:04 -0700
+Subject: selftest: use mmap instead of posix_memalign to allocate memory
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit 0db282ba2c12c1515d490d14a1ff696643ab0f1b upstream.
+
+This test passes pointers obtained from anon_allocate_area to the
+userfaultfd and mremap APIs.  This causes a problem if the system
+allocator returns tagged pointers because with the tagged address ABI
+the kernel rejects tagged addresses passed to these APIs, which would
+end up causing the test to fail.  To make this test compatible with such
+system allocators, stop using the system allocator to allocate memory in
+anon_allocate_area, and instead just use mmap.
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-3-pcc@google.com
+Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5b8fc241
+Fixes: c47174fc362a ("userfaultfd: selftest")
+Co-developed-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: <stable@vger.kernel.org>   [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/vm/userfaultfd.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/vm/userfaultfd.c
++++ b/tools/testing/selftests/vm/userfaultfd.c
+@@ -139,8 +139,10 @@ static int anon_release_pages(char *rel_
+ static void anon_allocate_area(void **alloc_area)
+ {
+-      if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
+-              fprintf(stderr, "out of memory\n");
++      *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
++                         MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
++      if (*alloc_area == MAP_FAILED) {
++              fprintf(stderr, "mmap of anonymous memory failed");
+               *alloc_area = NULL;
+       }
+ }
index 14ed0ef8328df9e42d048579466e9f6d7ef756b0..5a72f71c8c314c37535fe5fafc1c82b02d7ed568 100644 (file)
@@ -86,3 +86,11 @@ usb-serial-option-add-support-for-u-blox-lara-r6-family.patch
 usb-serial-cp210x-fix-comments-for-ge-cs1000.patch
 usb-serial-cp210x-add-id-for-cel-em3588-usb-zigbee-stick.patch
 usb-dwc2-gadget-fix-sending-zero-length-packet-in-ddma-mode.patch
+firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
+tracing-histogram-rename-cpu-to-common_cpu.patch
+tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
+btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
+media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
+ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
+selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
+userfaultfd-do-not-untag-user-pointers.patch
diff --git a/queue-5.4/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch b/queue-5.4/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
new file mode 100644 (file)
index 0000000..11c0fd0
--- /dev/null
@@ -0,0 +1,102 @@
+From 67f0d6d9883c13174669f88adac4f0ee656cc16a Mon Sep 17 00:00:00 2001
+From: Haoran Luo <www@aegistudio.net>
+Date: Wed, 21 Jul 2021 14:12:07 +0000
+Subject: tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop.
+
+From: Haoran Luo <www@aegistudio.net>
+
+commit 67f0d6d9883c13174669f88adac4f0ee656cc16a upstream.
+
+The "rb_per_cpu_empty()" misinterpret the condition (as not-empty) when
+"head_page" and "commit_page" of "struct ring_buffer_per_cpu" points to
+the same buffer page, whose "buffer_data_page" is empty and "read" field
+is non-zero.
+
+An error scenario could be constructed as followed (kernel perspective):
+
+1. All pages in the buffer has been accessed by reader(s) so that all of
+them will have non-zero "read" field.
+
+2. Read and clear all buffer pages so that "rb_num_of_entries()" will
+return 0 rendering there's no more data to read. It is also required
+that the "read_page", "commit_page" and "tail_page" points to the same
+page, while "head_page" is the next page of them.
+
+3. Invoke "ring_buffer_lock_reserve()" with large enough "length"
+so that it shoots past the end of the current tail buffer page. Now the
+"head_page", "commit_page" and "tail_page" points to the same page.
+
+4. Discard current event with "ring_buffer_discard_commit()", so that
+"head_page", "commit_page" and "tail_page" points to a page whose buffer
+data page is now empty.
+
+When the error scenario has been constructed, "tracing_read_pipe" will
+be trapped inside a deadloop: "trace_empty()" returns 0 since
+"rb_per_cpu_empty()" returns 0 when it hits the CPU containing such
+constructed ring buffer. Then "trace_find_next_entry_inc()" always
+return NULL since "rb_num_of_entries()" reports there's no more entry
+to read. Finally "trace_seq_to_user()" returns "-EBUSY" spanking
+"tracing_read_pipe" back to the start of the "waitagain" loop.
+
+I've also written a proof-of-concept script to construct the scenario
+and trigger the bug automatically, you can use it to trace and validate
+my reasoning above:
+
+  https://github.com/aegistudio/RingBufferDetonator.git
+
+Tests has been carried out on linux kernel 5.14-rc2
+(2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version
+of kernel (for testing whether my update fixes the bug) and
+some older kernels (for range of affected kernels). Test result is
+also attached to the proof-of-concept repository.
+
+Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/
+Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio
+
+Cc: stable@vger.kernel.org
+Fixes: bf41a158cacba ("ring-buffer: make reentrant")
+Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
+Signed-off-by: Haoran Luo <www@aegistudio.net>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c |   28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -3221,10 +3221,30 @@ static bool rb_per_cpu_empty(struct ring
+       if (unlikely(!head))
+               return true;
+-      return reader->read == rb_page_commit(reader) &&
+-              (commit == reader ||
+-               (commit == head &&
+-                head->read == rb_page_commit(commit)));
++      /* Reader should exhaust content in reader page */
++      if (reader->read != rb_page_commit(reader))
++              return false;
++
++      /*
++       * If writers are committing on the reader page, knowing all
++       * committed content has been read, the ring buffer is empty.
++       */
++      if (commit == reader)
++              return true;
++
++      /*
++       * If writers are committing on a page other than reader page
++       * and head page, there should always be content to read.
++       */
++      if (commit != head)
++              return false;
++
++      /*
++       * Writers are committing on the head page, we just need
++       * to care about there're committed data, and the reader will
++       * swap reader page with head page when it is to read data.
++       */
++      return rb_page_commit(commit) == 0;
+ }
+ /**
diff --git a/queue-5.4/tracing-histogram-rename-cpu-to-common_cpu.patch b/queue-5.4/tracing-histogram-rename-cpu-to-common_cpu.patch
new file mode 100644 (file)
index 0000000..a7fa68f
--- /dev/null
@@ -0,0 +1,152 @@
+From 1e3bac71c5053c99d438771fc9fa5082ae5d90aa Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Wed, 21 Jul 2021 11:00:53 -0400
+Subject: tracing/histogram: Rename "cpu" to "common_cpu"
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 1e3bac71c5053c99d438771fc9fa5082ae5d90aa upstream.
+
+Currently the histogram logic allows the user to write "cpu" in as an
+event field, and it will record the CPU that the event happened on.
+
+The problem with this is that there's a lot of events that have "cpu"
+as a real field, and using "cpu" as the CPU it ran on, makes it
+impossible to run histograms on the "cpu" field of events.
+
+For example, if I want to have a histogram on the count of the
+workqueue_queue_work event on its cpu field, running:
+
+ ># echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger
+
+Gives a misleading and wrong result.
+
+Change the command to "common_cpu" as no event should have "common_*"
+fields as that's a reserved name for fields used by all events. And
+this makes sense here as common_cpu would be a field used by all events.
+
+Now we can even do:
+
+ ># echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger
+ ># cat events/workqueue/workqueue_queue_work/hist
+ # event histogram
+ #
+ # trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active]
+ #
+
+ { common_cpu:          0, cpu:          2 } hitcount:          1
+ { common_cpu:          0, cpu:          4 } hitcount:          1
+ { common_cpu:          7, cpu:          7 } hitcount:          1
+ { common_cpu:          0, cpu:          7 } hitcount:          1
+ { common_cpu:          0, cpu:          1 } hitcount:          1
+ { common_cpu:          0, cpu:          6 } hitcount:          2
+ { common_cpu:          0, cpu:          5 } hitcount:          2
+ { common_cpu:          1, cpu:          1 } hitcount:          4
+ { common_cpu:          6, cpu:          6 } hitcount:          4
+ { common_cpu:          5, cpu:          5 } hitcount:         14
+ { common_cpu:          4, cpu:          4 } hitcount:         26
+ { common_cpu:          0, cpu:          0 } hitcount:         39
+ { common_cpu:          2, cpu:          2 } hitcount:        184
+
+Now for backward compatibility, I added a trick. If "cpu" is used, and
+the field is not found, it will fall back to "common_cpu" and work as
+it did before. This way, it will still work for old programs that use
+"cpu" to get the actual CPU, but if the event has a "cpu" as a field, it
+will get that event's "cpu" field, which is probably what it wants
+anyway.
+
+I updated the tracefs/README to include documentation about both the
+common_timestamp and the common_cpu. This way, if that text is present in
+the README, then an application can know that common_cpu is supported over
+just plain "cpu".
+
+Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home
+
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers")
+Reviewed-by: Tom Zanussi <zanussi@kernel.org>
+Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/trace/histogram.rst |    2 +-
+ kernel/trace/trace.c              |    4 ++++
+ kernel/trace/trace_events_hist.c  |   22 ++++++++++++++++------
+ 3 files changed, 21 insertions(+), 7 deletions(-)
+
+--- a/Documentation/trace/histogram.rst
++++ b/Documentation/trace/histogram.rst
+@@ -191,7 +191,7 @@ Documentation written by Tom Zanussi
+                                 with the event, in nanoseconds.  May be
+                               modified by .usecs to have timestamps
+                               interpreted as microseconds.
+-    cpu                    int  the cpu on which the event occurred.
++    common_cpu             int  the cpu on which the event occurred.
+     ====================== ==== =======================================
+ Extended error information
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -4975,6 +4975,10 @@ static const char readme_msg[] =
+       "\t            [:name=histname1]\n"
+       "\t            [:<handler>.<action>]\n"
+       "\t            [if <filter>]\n\n"
++      "\t    Note, special fields can be used as well:\n"
++      "\t            common_timestamp - to record current timestamp\n"
++      "\t            common_cpu - to record the CPU the event happened on\n"
++      "\n"
+       "\t    When a matching event is hit, an entry is added to a hash\n"
+       "\t    table using the key(s) and value(s) named, and the value of a\n"
+       "\t    sum called 'hitcount' is incremented.  Keys and values\n"
+--- a/kernel/trace/trace_events_hist.c
++++ b/kernel/trace/trace_events_hist.c
+@@ -2001,7 +2001,7 @@ static const char *hist_field_name(struc
+                field->flags & HIST_FIELD_FL_ALIAS)
+               field_name = hist_field_name(field->operands[0], ++level);
+       else if (field->flags & HIST_FIELD_FL_CPU)
+-              field_name = "cpu";
++              field_name = "common_cpu";
+       else if (field->flags & HIST_FIELD_FL_EXPR ||
+                field->flags & HIST_FIELD_FL_VAR_REF) {
+               if (field->system) {
+@@ -2873,14 +2873,24 @@ parse_field(struct hist_trigger_data *hi
+               hist_data->enable_timestamps = true;
+               if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+                       hist_data->attrs->ts_in_usecs = true;
+-      } else if (strcmp(field_name, "cpu") == 0)
++      } else if (strcmp(field_name, "common_cpu") == 0)
+               *flags |= HIST_FIELD_FL_CPU;
+       else {
+               field = trace_find_event_field(file->event_call, field_name);
+               if (!field || !field->size) {
+-                      hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
+-                      field = ERR_PTR(-EINVAL);
+-                      goto out;
++                      /*
++                       * For backward compatibility, if field_name
++                       * was "cpu", then we treat this the same as
++                       * common_cpu.
++                       */
++                      if (strcmp(field_name, "cpu") == 0) {
++                              *flags |= HIST_FIELD_FL_CPU;
++                      } else {
++                              hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
++                                       errpos(field_name));
++                              field = ERR_PTR(-EINVAL);
++                              goto out;
++                      }
+               }
+       }
+  out:
+@@ -5641,7 +5651,7 @@ static void hist_field_print(struct seq_
+               seq_printf(m, "%s=", hist_field->var.name);
+       if (hist_field->flags & HIST_FIELD_FL_CPU)
+-              seq_puts(m, "cpu");
++              seq_puts(m, "common_cpu");
+       else if (field_name) {
+               if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+                   hist_field->flags & HIST_FIELD_FL_ALIAS)
diff --git a/queue-5.4/userfaultfd-do-not-untag-user-pointers.patch b/queue-5.4/userfaultfd-do-not-untag-user-pointers.patch
new file mode 100644 (file)
index 0000000..7b3c41b
--- /dev/null
@@ -0,0 +1,187 @@
+From e71e2ace5721a8b921dca18b045069e7bb411277 Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:01 -0700
+Subject: userfaultfd: do not untag user pointers
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit e71e2ace5721a8b921dca18b045069e7bb411277 upstream.
+
+Patch series "userfaultfd: do not untag user pointers", v5.
+
+If a user program uses userfaultfd on ranges of heap memory, it may end
+up passing a tagged pointer to the kernel in the range.start field of
+the UFFDIO_REGISTER ioctl.  This can happen when using an MTE-capable
+allocator, or on Android if using the Tagged Pointers feature for MTE
+readiness [1].
+
+When a fault subsequently occurs, the tag is stripped from the fault
+address returned to the application in the fault.address field of struct
+uffd_msg.  However, from the application's perspective, the tagged
+address *is* the memory address, so if the application is unaware of
+memory tags, it may get confused by receiving an address that is, from
+its point of view, outside of the bounds of the allocation.  We observed
+this behavior in the kselftest for userfaultfd [2] but other
+applications could have the same problem.
+
+Address this by not untagging pointers passed to the userfaultfd ioctls.
+Instead, let the system call fail.  Also change the kselftest to use
+mmap so that it doesn't encounter this problem.
+
+[1] https://source.android.com/devices/tech/debug/tagged-pointers
+[2] tools/testing/selftests/vm/userfaultfd.c
+
+This patch (of 2):
+
+Do not untag pointers passed to the userfaultfd ioctls.  Instead, let
+the system call fail.  This will provide an early indication of problems
+with tag-unaware userspace code instead of letting the code get confused
+later, and is consistent with how we decided to handle brk/mmap/mremap
+in commit dcde237319e6 ("mm: Avoid creating virtual address aliases in
+brk()/mmap()/mremap()"), as well as being consistent with the existing
+tagged address ABI documentation relating to how ioctl arguments are
+handled.
+
+The code change is a revert of commit 7d0325749a6c ("userfaultfd: untag
+user pointers") plus some fixups to some additional calls to
+validate_range that have appeared since then.
+
+[1] https://source.android.com/devices/tech/debug/tagged-pointers
+[2] tools/testing/selftests/vm/userfaultfd.c
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-1-pcc@google.com
+Link: https://lkml.kernel.org/r/20210714195437.118982-2-pcc@google.com
+Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0a25501b
+Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: <stable@vger.kernel.org>   [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/arm64/tagged-address-abi.rst |   26 ++++++++++++++++++--------
+ fs/userfaultfd.c                           |   22 ++++++++++------------
+ 2 files changed, 28 insertions(+), 20 deletions(-)
+
+--- a/Documentation/arm64/tagged-address-abi.rst
++++ b/Documentation/arm64/tagged-address-abi.rst
+@@ -45,14 +45,24 @@ how the user addresses are used by the k
+ 1. User addresses not accessed by the kernel but used for address space
+    management (e.g. ``mprotect()``, ``madvise()``). The use of valid
+-   tagged pointers in this context is allowed with the exception of
+-   ``brk()``, ``mmap()`` and the ``new_address`` argument to
+-   ``mremap()`` as these have the potential to alias with existing
+-   user addresses.
+-
+-   NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+-   incorrectly accept valid tagged pointers for the ``brk()``,
+-   ``mmap()`` and ``mremap()`` system calls.
++   tagged pointers in this context is allowed with these exceptions:
++
++   - ``brk()``, ``mmap()`` and the ``new_address`` argument to
++     ``mremap()`` as these have the potential to alias with existing
++     user addresses.
++
++     NOTE: This behaviour changed in v5.6 and so some earlier kernels may
++     incorrectly accept valid tagged pointers for the ``brk()``,
++     ``mmap()`` and ``mremap()`` system calls.
++
++   - The ``range.start``, ``start`` and ``dst`` arguments to the
++     ``UFFDIO_*`` ``ioctl()`` calls used on a file descriptor obtained from
++     ``userfaultfd()``, as fault addresses subsequently obtained by reading
++     the file descriptor will be untagged, which may otherwise confuse
++     tag-unaware programs.
++
++     NOTE: This behaviour changed in v5.14 and so some earlier kernels may
++     incorrectly accept valid tagged pointers for this system call.
+ 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
+    relaxation is disabled by default and the application thread needs to
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -1272,23 +1272,21 @@ static __always_inline void wake_userfau
+ }
+ static __always_inline int validate_range(struct mm_struct *mm,
+-                                        __u64 *start, __u64 len)
++                                        __u64 start, __u64 len)
+ {
+       __u64 task_size = mm->task_size;
+-      *start = untagged_addr(*start);
+-
+-      if (*start & ~PAGE_MASK)
++      if (start & ~PAGE_MASK)
+               return -EINVAL;
+       if (len & ~PAGE_MASK)
+               return -EINVAL;
+       if (!len)
+               return -EINVAL;
+-      if (*start < mmap_min_addr)
++      if (start < mmap_min_addr)
+               return -EINVAL;
+-      if (*start >= task_size)
++      if (start >= task_size)
+               return -EINVAL;
+-      if (len > task_size - *start)
++      if (len > task_size - start)
+               return -EINVAL;
+       return 0;
+ }
+@@ -1338,7 +1336,7 @@ static int userfaultfd_register(struct u
+               goto out;
+       }
+-      ret = validate_range(mm, &uffdio_register.range.start,
++      ret = validate_range(mm, uffdio_register.range.start,
+                            uffdio_register.range.len);
+       if (ret)
+               goto out;
+@@ -1527,7 +1525,7 @@ static int userfaultfd_unregister(struct
+       if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+               goto out;
+-      ret = validate_range(mm, &uffdio_unregister.start,
++      ret = validate_range(mm, uffdio_unregister.start,
+                            uffdio_unregister.len);
+       if (ret)
+               goto out;
+@@ -1678,7 +1676,7 @@ static int userfaultfd_wake(struct userf
+       if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
++      ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+       if (ret)
+               goto out;
+@@ -1718,7 +1716,7 @@ static int userfaultfd_copy(struct userf
+                          sizeof(uffdio_copy)-sizeof(__s64)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
++      ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+       if (ret)
+               goto out;
+       /*
+@@ -1774,7 +1772,7 @@ static int userfaultfd_zeropage(struct u
+                          sizeof(uffdio_zeropage)-sizeof(__s64)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
++      ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+                            uffdio_zeropage.range.len);
+       if (ret)
+               goto out;