git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Jul 2021 08:53:55 +0000 (10:53 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Jul 2021 08:53:55 +0000 (10:53 +0200)
added patches:
btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
bus-mhi-core-validate-channel-id-when-processing-command-completions.patch
firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
io_uring-explicitly-count-entries-for-poll-reqs.patch
io_uring-remove-double-poll-entry-on-arm-failure.patch
ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
posix-cpu-timers-fix-rearm-racing-against-process-tick.patch
selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch
tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
tracing-histogram-rename-cpu-to-common_cpu.patch
tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch
userfaultfd-do-not-untag-user-pointers.patch

15 files changed:
queue-5.10/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch [new file with mode: 0644]
queue-5.10/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch [new file with mode: 0644]
queue-5.10/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch [new file with mode: 0644]
queue-5.10/io_uring-explicitly-count-entries-for-poll-reqs.patch [new file with mode: 0644]
queue-5.10/io_uring-remove-double-poll-entry-on-arm-failure.patch [new file with mode: 0644]
queue-5.10/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch [new file with mode: 0644]
queue-5.10/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch [new file with mode: 0644]
queue-5.10/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch [new file with mode: 0644]
queue-5.10/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch [new file with mode: 0644]
queue-5.10/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch [new file with mode: 0644]
queue-5.10/tracing-histogram-rename-cpu-to-common_cpu.patch [new file with mode: 0644]
queue-5.10/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch [new file with mode: 0644]
queue-5.10/userfaultfd-do-not-untag-user-pointers.patch [new file with mode: 0644]

diff --git a/queue-5.10/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch b/queue-5.10/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
new file mode 100644 (file)
index 0000000..47f84df
--- /dev/null
@@ -0,0 +1,80 @@
+From 16a200f66ede3f9afa2e51d90ade017aaa18d213 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Sun, 4 Jul 2021 19:14:39 +0800
+Subject: btrfs: check for missing device in btrfs_trim_fs
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 16a200f66ede3f9afa2e51d90ade017aaa18d213 upstream.
+
+A fstrim on a degraded raid1 can trigger the following null pointer
+dereference:
+
+  BTRFS info (device loop0): allowing degraded mounts
+  BTRFS info (device loop0): disk space caching is enabled
+  BTRFS info (device loop0): has skinny extents
+  BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+  BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+  BTRFS info (device loop0): enabling ssd optimizations
+  BUG: kernel NULL pointer dereference, address: 0000000000000620
+  PGD 0 P4D 0
+  Oops: 0000 [#1] SMP NOPTI
+  CPU: 0 PID: 4574 Comm: fstrim Not tainted 5.13.0-rc7+ #31
+  Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+  RIP: 0010:btrfs_trim_fs+0x199/0x4a0 [btrfs]
+  RSP: 0018:ffff959541797d28 EFLAGS: 00010293
+  RAX: 0000000000000000 RBX: ffff946f84eca508 RCX: a7a67937adff8608
+  RDX: ffff946e8122d000 RSI: 0000000000000000 RDI: ffffffffc02fdbf0
+  RBP: ffff946ea4615000 R08: 0000000000000001 R09: 0000000000000000
+  R10: 0000000000000000 R11: ffff946e8122d960 R12: 0000000000000000
+  R13: ffff959541797db8 R14: ffff946e8122d000 R15: ffff959541797db8
+  FS:  00007f55917a5080(0000) GS:ffff946f9bc00000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 0000000000000620 CR3: 000000002d2c8001 CR4: 00000000000706f0
+  Call Trace:
+  btrfs_ioctl_fitrim+0x167/0x260 [btrfs]
+  btrfs_ioctl+0x1c00/0x2fe0 [btrfs]
+  ? selinux_file_ioctl+0x140/0x240
+  ? syscall_trace_enter.constprop.0+0x188/0x240
+  ? __x64_sys_ioctl+0x83/0xb0
+  __x64_sys_ioctl+0x83/0xb0
+
+Reproducer:
+
+  $ mkfs.btrfs -fq -d raid1 -m raid1 /dev/loop0 /dev/loop1
+  $ mount /dev/loop0 /btrfs
+  $ umount /btrfs
+  $ btrfs dev scan --forget
+  $ mount -o degraded /dev/loop0 /btrfs
+
+  $ fstrim /btrfs
+
+The reason is we call btrfs_trim_free_extents() for the missing device,
+which uses device->bdev (NULL for missing device) to find if the device
+supports discard.
+
+Fix is to check if the device is missing before calling
+btrfs_trim_free_extents().
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5883,6 +5883,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
++              if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
++                      continue;
++
+               ret = btrfs_trim_free_extents(device, &group_trimmed);
+               if (ret) {
+                       dev_failed++;
diff --git a/queue-5.10/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch b/queue-5.10/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch
new file mode 100644 (file)
index 0000000..d1dc9eb
--- /dev/null
@@ -0,0 +1,56 @@
+From 546362a9ef2ef40b57c6605f14e88ced507f8dd0 Mon Sep 17 00:00:00 2001
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Date: Fri, 16 Jul 2021 13:21:05 +0530
+Subject: bus: mhi: core: Validate channel ID when processing command completions
+
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+
+commit 546362a9ef2ef40b57c6605f14e88ced507f8dd0 upstream.
+
+MHI reads the channel ID from the event ring element sent by the
+device which can be any value between 0 and 255. In order to
+prevent any out of bound accesses, add a check against the maximum
+number of channels supported by the controller and those channels
+not configured yet so as to skip processing of that event ring
+element.
+
+Link: https://lore.kernel.org/r/1624558141-11045-1-git-send-email-bbhatt@codeaurora.org
+Fixes: 1d3173a3bae7 ("bus: mhi: core: Add support for processing events from client device")
+Cc: stable@vger.kernel.org #5.10
+Reviewed-by: Hemant Kumar <hemantk@codeaurora.org>
+Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Link: https://lore.kernel.org/r/20210716075106.49938-3-manivannan.sadhasivam@linaro.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/bus/mhi/core/main.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/bus/mhi/core/main.c
++++ b/drivers/bus/mhi/core/main.c
+@@ -706,11 +706,18 @@ static void mhi_process_cmd_completion(s
+       cmd_pkt = mhi_to_virtual(mhi_ring, ptr);
+       chan = MHI_TRE_GET_CMD_CHID(cmd_pkt);
+-      mhi_chan = &mhi_cntrl->mhi_chan[chan];
+-      write_lock_bh(&mhi_chan->lock);
+-      mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre);
+-      complete(&mhi_chan->completion);
+-      write_unlock_bh(&mhi_chan->lock);
++
++      if (chan < mhi_cntrl->max_chan &&
++          mhi_cntrl->mhi_chan[chan].configured) {
++              mhi_chan = &mhi_cntrl->mhi_chan[chan];
++              write_lock_bh(&mhi_chan->lock);
++              mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre);
++              complete(&mhi_chan->completion);
++              write_unlock_bh(&mhi_chan->lock);
++      } else {
++              dev_err(&mhi_cntrl->mhi_dev->dev,
++                      "Completion packet for invalid channel ID: %d\n", chan);
++      }
+       mhi_del_ring_element(mhi_cntrl, mhi_ring);
+ }
diff --git a/queue-5.10/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch b/queue-5.10/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
new file mode 100644 (file)
index 0000000..907fee2
--- /dev/null
@@ -0,0 +1,66 @@
+From 2bab693a608bdf614b9fcd44083c5100f34b9f77 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Tue, 13 Jul 2021 19:43:26 +0100
+Subject: firmware/efi: Tell memblock about EFI iomem reservations
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 2bab693a608bdf614b9fcd44083c5100f34b9f77 upstream.
+
+kexec_load_file() relies on the memblock infrastructure to avoid
+stamping over regions of memory that are essential to the survival
+of the system.
+
+However, nobody seems to agree how to flag these regions as reserved,
+and (for example) EFI only publishes its reservations in /proc/iomem
+for the benefit of the traditional, userspace based kexec tool.
+
+On arm64 platforms with GICv3, this can result in the payload being
+placed at the location of the LPI tables. Shock, horror!
+
+Let's augment the EFI reservation code with a memblock_reserve() call,
+protecting our dear tables from the secondary kernel invasion.
+
+Reported-by: Moritz Fischer <mdf@kernel.org>
+Tested-by: Moritz Fischer <mdf@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: James Morse <james.morse@arm.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/efi.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/drivers/firmware/efi/efi.c
++++ b/drivers/firmware/efi/efi.c
+@@ -896,6 +896,7 @@ static int __init efi_memreserve_map_roo
+ static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size)
+ {
+       struct resource *res, *parent;
++      int ret;
+       res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+       if (!res)
+@@ -908,7 +909,17 @@ static int efi_mem_reserve_iomem(phys_ad
+       /* we expect a conflict with a 'System RAM' region */
+       parent = request_resource_conflict(&iomem_resource, res);
+-      return parent ? request_resource(parent, res) : 0;
++      ret = parent ? request_resource(parent, res) : 0;
++
++      /*
++       * Given that efi_mem_reserve_iomem() can be called at any
++       * time, only call memblock_reserve() if the architecture
++       * keeps the infrastructure around.
++       */
++      if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret)
++              memblock_reserve(addr, size);
++
++      return ret;
+ }
+ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
diff --git a/queue-5.10/io_uring-explicitly-count-entries-for-poll-reqs.patch b/queue-5.10/io_uring-explicitly-count-entries-for-poll-reqs.patch
new file mode 100644 (file)
index 0000000..cba0ad7
--- /dev/null
@@ -0,0 +1,74 @@
+From 68b11e8b1562986c134764433af64e97d30c9fc0 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Tue, 20 Jul 2021 10:50:43 +0100
+Subject: io_uring: explicitly count entries for poll reqs
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 68b11e8b1562986c134764433af64e97d30c9fc0 upstream.
+
+If __io_queue_proc() fails to add a second poll entry, e.g. kmalloc()
+failed, but it goes on with a third waitqueue, it may succeed and
+overwrite the error status. Count the number of poll entries we added,
+so we can set pt->error to zero at the beginning and find out when the
+mentioned scenario happens.
+
+Cc: stable@vger.kernel.org
+Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/9d6b9e561f88bcc0163623b74a76c39f712151c3.1626774457.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c |   16 ++++++++++------
+ 1 file changed, 10 insertions(+), 6 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -4916,6 +4916,7 @@ static int io_connect(struct io_kiocb *r
+ struct io_poll_table {
+       struct poll_table_struct pt;
+       struct io_kiocb *req;
++      int nr_entries;
+       int error;
+ };
+@@ -5098,11 +5099,11 @@ static void __io_queue_proc(struct io_po
+       struct io_kiocb *req = pt->req;
+       /*
+-       * If poll->head is already set, it's because the file being polled
+-       * uses multiple waitqueues for poll handling (eg one for read, one
+-       * for write). Setup a separate io_poll_iocb if this happens.
++       * The file being polled uses multiple waitqueues for poll handling
++       * (e.g. one for read, one for write). Setup a separate io_poll_iocb
++       * if this happens.
+        */
+-      if (unlikely(poll->head)) {
++      if (unlikely(pt->nr_entries)) {
+               struct io_poll_iocb *poll_one = poll;
+               /* already have a 2nd entry, fail a third attempt */
+@@ -5124,7 +5125,7 @@ static void __io_queue_proc(struct io_po
+               *poll_ptr = poll;
+       }
+-      pt->error = 0;
++      pt->nr_entries++;
+       poll->head = head;
+       if (poll->events & EPOLLEXCLUSIVE)
+@@ -5210,9 +5211,12 @@ static __poll_t __io_arm_poll_handler(st
+       ipt->pt._key = mask;
+       ipt->req = req;
+-      ipt->error = -EINVAL;
++      ipt->error = 0;
++      ipt->nr_entries = 0;
+       mask = vfs_poll(req->file, &ipt->pt) & poll->events;
++      if (unlikely(!ipt->nr_entries) && !ipt->error)
++              ipt->error = -EINVAL;
+       spin_lock_irq(&ctx->completion_lock);
+       if (likely(poll->head)) {
diff --git a/queue-5.10/io_uring-remove-double-poll-entry-on-arm-failure.patch b/queue-5.10/io_uring-remove-double-poll-entry-on-arm-failure.patch
new file mode 100644 (file)
index 0000000..e2c3d78
--- /dev/null
@@ -0,0 +1,46 @@
+From 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Tue, 20 Jul 2021 10:50:44 +0100
+Subject: io_uring: remove double poll entry on arm failure
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e upstream.
+
+__io_queue_proc() can enqueue both poll entries and still fail
+afterwards, so the callers trying to cancel it should also try to remove
+the second poll entry (if any).
+
+For example, it may leave the request alive referencing a io_uring
+context but not accessible for cancellation:
+
+[  282.599913][ T1620] task:iou-sqp-23145   state:D stack:28720 pid:23155 ppid:  8844 flags:0x00004004
+[  282.609927][ T1620] Call Trace:
+[  282.613711][ T1620]  __schedule+0x93a/0x26f0
+[  282.634647][ T1620]  schedule+0xd3/0x270
+[  282.638874][ T1620]  io_uring_cancel_generic+0x54d/0x890
+[  282.660346][ T1620]  io_sq_thread+0xaac/0x1250
+[  282.696394][ T1620]  ret_from_fork+0x1f/0x30
+
+Cc: stable@vger.kernel.org
+Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users")
+Reported-and-tested-by: syzbot+ac957324022b7132accf@syzkaller.appspotmail.com
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/0ec1228fc5eda4cb524eeda857da8efdc43c331c.1626774457.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -5219,6 +5219,8 @@ static __poll_t __io_arm_poll_handler(st
+               ipt->error = -EINVAL;
+       spin_lock_irq(&ctx->completion_lock);
++      if (ipt->error)
++              io_poll_remove_double(req);
+       if (likely(poll->head)) {
+               spin_lock(&poll->head->lock);
+               if (unlikely(list_empty(&poll->wait.entry))) {
diff --git a/queue-5.10/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch b/queue-5.10/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
new file mode 100644 (file)
index 0000000..2c8104c
--- /dev/null
@@ -0,0 +1,55 @@
+From 09cfae9f13d51700b0fecf591dcd658fc5375428 Mon Sep 17 00:00:00 2001
+From: Markus Boehme <markubo@amazon.com>
+Date: Tue, 20 Jul 2021 16:26:19 -0700
+Subject: ixgbe: Fix packet corruption due to missing DMA sync
+
+From: Markus Boehme <markubo@amazon.com>
+
+commit 09cfae9f13d51700b0fecf591dcd658fc5375428 upstream.
+
+When receiving a packet with multiple fragments, hardware may still
+touch the first fragment until the entire packet has been received. The
+driver therefore keeps the first fragment mapped for DMA until end of
+packet has been asserted, and delays its dma_sync call until then.
+
+The driver tries to fit multiple receive buffers on one page. When using
+3K receive buffers (e.g. using Jumbo frames and legacy-rx is turned
+off/build_skb is being used) on an architecture with 4K pages, the
+driver allocates an order 1 compound page and uses one page per receive
+buffer. To determine the correct offset for a delayed DMA sync of the
+first fragment of a multi-fragment packet, the driver then cannot just
+use PAGE_MASK on the DMA address but has to construct a mask based on
+the actual size of the backing page.
+
+Using PAGE_MASK in the 3K RX buffer/4K page architecture configuration
+will always sync the first page of a compound page. With the SWIOTLB
+enabled this can lead to corrupted packets (zeroed out first fragment,
+re-used garbage from another packet) and various consequences, such as
+slow/stalling data transfers and connection resets. For example, testing
+on a link with MTU exceeding 3058 bytes on a host with SWIOTLB enabled
+(e.g. "iommu=soft swiotlb=262144,force") TCP transfers quickly fizzle
+out without this patch.
+
+Cc: stable@vger.kernel.org
+Fixes: 0c5661ecc5dd7 ("ixgbe: fix crash in build_skb Rx code path")
+Signed-off-by: Markus Boehme <markubo@amazon.com>
+Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+@@ -1825,7 +1825,8 @@ static void ixgbe_dma_sync_frag(struct i
+                               struct sk_buff *skb)
+ {
+       if (ring_uses_build_skb(rx_ring)) {
+-              unsigned long offset = (unsigned long)(skb->data) & ~PAGE_MASK;
++              unsigned long mask = (unsigned long)ixgbe_rx_pg_size(rx_ring) - 1;
++              unsigned long offset = (unsigned long)(skb->data) & mask;
+               dma_sync_single_range_for_cpu(rx_ring->dev,
+                                             IXGBE_CB(skb)->dma,
diff --git a/queue-5.10/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch b/queue-5.10/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
new file mode 100644 (file)
index 0000000..d233006
--- /dev/null
@@ -0,0 +1,82 @@
+From 8d4abca95ecc82fc8c41912fa0085281f19cc29f Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
+Date: Mon, 19 Apr 2021 18:43:32 -0500
+Subject: media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf()
+
+From: Gustavo A. R. Silva <gustavoars@kernel.org>
+
+commit 8d4abca95ecc82fc8c41912fa0085281f19cc29f upstream.
+
+Fix an 11-year old bug in ngene_command_config_free_buf() while
+addressing the following warnings caught with -Warray-bounds:
+
+arch/alpha/include/asm/string.h:22:16: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+arch/x86/include/asm/string_32.h:182:25: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+
+The problem is that the original code is trying to copy 6 bytes of
+data into a one-byte size member _config_ of the wrong structure
+FW_CONFIGURE_BUFFERS, in a single call to memcpy(). This causes a
+legitimate compiler warning because memcpy() overruns the length
+of &com.cmd.ConfigureBuffers.config. It seems that the right
+structure is FW_CONFIGURE_FREE_BUFFERS, instead, because it contains
+6 more members apart from the header _hdr_. Also, the name of
+the function ngene_command_config_free_buf() suggests that the actual
+intention is to ConfigureFreeBuffers, instead of ConfigureBuffers
+(which takes place in the function ngene_command_config_buf(), above).
+
+Fix this by enclosing those 6 members of struct FW_CONFIGURE_FREE_BUFFERS
+into new struct config, and use &com.cmd.ConfigureFreeBuffers.config as
+the destination address, instead of &com.cmd.ConfigureBuffers.config,
+when calling memcpy().
+
+This also helps with the ongoing efforts to globally enable
+-Warray-bounds and get us closer to being able to tighten the
+FORTIFY_SOURCE routines on memcpy().
+
+Link: https://github.com/KSPP/linux/issues/109
+Fixes: dae52d009fc9 ("V4L/DVB: ngene: Initial check-in")
+Cc: stable@vger.kernel.org
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
+Link: https://lore.kernel.org/linux-hardening/20210420001631.GA45456@embeddedor/
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/pci/ngene/ngene-core.c |    2 +-
+ drivers/media/pci/ngene/ngene.h      |   14 ++++++++------
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/media/pci/ngene/ngene-core.c
++++ b/drivers/media/pci/ngene/ngene-core.c
+@@ -385,7 +385,7 @@ static int ngene_command_config_free_buf
+       com.cmd.hdr.Opcode = CMD_CONFIGURE_FREE_BUFFER;
+       com.cmd.hdr.Length = 6;
+-      memcpy(&com.cmd.ConfigureBuffers.config, config, 6);
++      memcpy(&com.cmd.ConfigureFreeBuffers.config, config, 6);
+       com.in_len = 6;
+       com.out_len = 0;
+--- a/drivers/media/pci/ngene/ngene.h
++++ b/drivers/media/pci/ngene/ngene.h
+@@ -407,12 +407,14 @@ enum _BUFFER_CONFIGS {
+ struct FW_CONFIGURE_FREE_BUFFERS {
+       struct FW_HEADER hdr;
+-      u8   UVI1_BufferLength;
+-      u8   UVI2_BufferLength;
+-      u8   TVO_BufferLength;
+-      u8   AUD1_BufferLength;
+-      u8   AUD2_BufferLength;
+-      u8   TVA_BufferLength;
++      struct {
++              u8   UVI1_BufferLength;
++              u8   UVI2_BufferLength;
++              u8   TVO_BufferLength;
++              u8   AUD1_BufferLength;
++              u8   AUD2_BufferLength;
++              u8   TVA_BufferLength;
++      } __packed config;
+ } __attribute__ ((__packed__));
+ struct FW_CONFIGURE_UART {
diff --git a/queue-5.10/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch b/queue-5.10/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch
new file mode 100644 (file)
index 0000000..85675c1
--- /dev/null
@@ -0,0 +1,73 @@
+From 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <frederic@kernel.org>
+Date: Thu, 3 Jun 2021 01:15:59 +0200
+Subject: posix-cpu-timers: Fix rearm racing against process tick
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+commit 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 upstream.
+
+Since the process wide cputime counter is started locklessly from
+posix_cpu_timer_rearm(), it can be concurrently stopped by operations
+on other timers from the same thread group, such as in the following
+unlucky scenario:
+
+         CPU 0                                CPU 1
+         -----                                -----
+                                           timer_settime(TIMER B)
+   posix_cpu_timer_rearm(TIMER A)
+       cpu_clock_sample_group()
+           (pct->timers_active already true)
+
+                                           handle_posix_cpu_timers()
+                                               check_process_timers()
+                                                   stop_process_timers()
+                                                       pct->timers_active = false
+       arm_timer(TIMER A)
+
+   tick -> run_posix_cpu_timers()
+       // sees !pct->timers_active, ignore
+       // our TIMER A
+
+Fix this with simply locking process wide cputime counting start and
+timer arm in the same block.
+
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Fixes: 60f2ceaa8111 ("posix-cpu-timers: Remove unnecessary locking around cpu_clock_sample_group")
+Cc: stable@vger.kernel.org
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/time/posix-cpu-timers.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/kernel/time/posix-cpu-timers.c
++++ b/kernel/time/posix-cpu-timers.c
+@@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct
+       if (!p)
+               goto out;
++      /* Protect timer list r/w in arm_timer() */
++      sighand = lock_task_sighand(p, &flags);
++      if (unlikely(sighand == NULL))
++              goto out;
++
+       /*
+        * Fetch the current sample and update the timer's expiry time.
+        */
+@@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct
+       bump_cpu_timer(timer, now);
+-      /* Protect timer list r/w in arm_timer() */
+-      sighand = lock_task_sighand(p, &flags);
+-      if (unlikely(sighand == NULL))
+-              goto out;
+-
+       /*
+        * Now re-arm for the new expiry time.
+        */
diff --git a/queue-5.10/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch b/queue-5.10/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
new file mode 100644 (file)
index 0000000..b802709
--- /dev/null
@@ -0,0 +1,56 @@
+From 0db282ba2c12c1515d490d14a1ff696643ab0f1b Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:04 -0700
+Subject: selftest: use mmap instead of posix_memalign to allocate memory
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit 0db282ba2c12c1515d490d14a1ff696643ab0f1b upstream.
+
+This test passes pointers obtained from anon_allocate_area to the
+userfaultfd and mremap APIs.  This causes a problem if the system
+allocator returns tagged pointers because with the tagged address ABI
+the kernel rejects tagged addresses passed to these APIs, which would
+end up causing the test to fail.  To make this test compatible with such
+system allocators, stop using the system allocator to allocate memory in
+anon_allocate_area, and instead just use mmap.
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-3-pcc@google.com
+Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5b8fc241
+Fixes: c47174fc362a ("userfaultfd: selftest")
+Co-developed-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: <stable@vger.kernel.org>   [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/vm/userfaultfd.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/vm/userfaultfd.c
++++ b/tools/testing/selftests/vm/userfaultfd.c
+@@ -180,8 +180,10 @@ static int anon_release_pages(char *rel_
+ static void anon_allocate_area(void **alloc_area)
+ {
+-      if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
+-              fprintf(stderr, "out of memory\n");
++      *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
++                         MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
++      if (*alloc_area == MAP_FAILED)
++              fprintf(stderr, "mmap of anonymous memory failed");
+               *alloc_area = NULL;
+       }
+ }
index 63f9b819c8dc86b285614a43753c2d24d34e4a21..ccf5c14b826591d881f3cccf2f1db65299165194 100644 (file)
@@ -132,3 +132,17 @@ usb-gadget-fix-unbalanced-pm_runtime_enable-in-tegra_xudc_probe.patch
 usb-dwc2-gadget-fix-goutnak-flow-for-slave-mode.patch
 usb-dwc2-gadget-fix-sending-zero-length-packet-in-ddma-mode.patch
 usb-typec-stusb160x-register-role-switch-before-interrupt-registration.patch
+firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
+tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch
+tracing-histogram-rename-cpu-to-common_cpu.patch
+tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
+tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch
+btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
+media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
+ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
+bus-mhi-core-validate-channel-id-when-processing-command-completions.patch
+posix-cpu-timers-fix-rearm-racing-against-process-tick.patch
+selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
+io_uring-explicitly-count-entries-for-poll-reqs.patch
+io_uring-remove-double-poll-entry-on-arm-failure.patch
+userfaultfd-do-not-untag-user-pointers.patch
diff --git a/queue-5.10/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch b/queue-5.10/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch
new file mode 100644 (file)
index 0000000..f6c3646
--- /dev/null
@@ -0,0 +1,120 @@
+From 352384d5c84ebe40fa77098cc234fe173247d8ef Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Thu, 22 Jul 2021 21:52:18 -0400
+Subject: tracepoints: Update static_call before tp_funcs when adding a tracepoint
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 352384d5c84ebe40fa77098cc234fe173247d8ef upstream.
+
+Because of the significant overhead that retpolines pose on indirect
+calls, the tracepoint code was updated to use the new "static_calls" that
+can modify the running code to directly call a function instead of using
+an indirect caller, and this function can be changed at runtime.
+
+In the tracepoint code that calls all the registered callbacks that are
+attached to a tracepoint, the following is done:
+
+       it_func_ptr = rcu_dereference_raw((&__tracepoint_##name)->funcs);
+       if (it_func_ptr) {
+               __data = (it_func_ptr)->data;
+               static_call(tp_func_##name)(__data, args);
+       }
+
+If there's just a single callback, the static_call is updated to just call
+that callback directly. Once another handler is added, then the static
+caller is updated to call the iterator, that simply loops over all the
+funcs in the array and calls each of the callbacks like the old method
+using indirect calling.
+
+The issue was discovered with a race between updating the funcs array and
+updating the static_call. The funcs array was updated first and then the
+static_call was updated. This is not an issue as long as the first element
+in the old array is the same as the first element in the new array. But
+that assumption is incorrect, because callbacks also have a priority
+field, and if there's a callback added that has a higher priority than the
+callback on the old array, then it will become the first callback in the
+new array. This means that it is possible to call the old callback with
+the new callback data element, which can cause a kernel panic.
+
+       static_call = callback1()
+       funcs[] = {callback1,data1};
+       callback2 has higher priority than callback1
+
+       CPU 1                           CPU 2
+       -----                           -----
+
+   new_funcs = {callback2,data2},
+               {callback1,data1}
+
+   rcu_assign_pointer(tp->funcs, new_funcs);
+
+  /*
+   * Now tp->funcs has the new array
+   * but the static_call still calls callback1
+   */
+
+                               it_func_ptr = tp->funcs [ new_funcs ]
+                               data = it_func_ptr->data [ data2 ]
+                               static_call(callback1, data);
+
+                               /* Now callback1 is called with
+                                * callback2's data */
+
+                               [ KERNEL PANIC ]
+
+   update_static_call(iterator);
+
+To prevent this from happening, always switch the static_call to the
+iterator before assigning the tp->funcs to the new array. The iterator will
+always properly match the callback with its data.
+
+To trigger this bug:
+
+  In one terminal:
+
+    while :; do hackbench 50; done
+
+  In another terminal
+
+    echo 1 > /sys/kernel/tracing/events/sched/sched_waking/enable
+    while :; do
+        echo 1 > /sys/kernel/tracing/set_event_pid;
+        sleep 0.5
+        echo 0 > /sys/kernel/tracing/set_event_pid;
+        sleep 0.5
+   done
+
+And it doesn't take long to crash. This is because the set_event_pid adds
+a callback to the sched_waking tracepoint with a high priority, which will
+be called before the sched_waking trace event callback is called.
+
+Note, the removal to a single callback updates the array first, before
+changing the static_call to single callback, which is the proper order as
+the first element in the array is the same as what the static_call is
+being changed to.
+
+Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/
+
+Cc: stable@vger.kernel.org
+Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()")
+Reported-by: Stefan Metzmacher <metze@samba.org>
+tested-by: Stefan Metzmacher <metze@samba.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/tracepoint.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/tracepoint.c
++++ b/kernel/tracepoint.c
+@@ -320,8 +320,8 @@ static int tracepoint_add_func(struct tr
+        * a pointer to it.  This array is referenced by __DO_TRACE from
+        * include/linux/tracepoint.h using rcu_dereference_sched().
+        */
+-      rcu_assign_pointer(tp->funcs, tp_funcs);
+       tracepoint_update_call(tp, tp_funcs, false);
++      rcu_assign_pointer(tp->funcs, tp_funcs);
+       static_key_enable(&tp->key);
+       release_probes(old);
diff --git a/queue-5.10/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch b/queue-5.10/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
new file mode 100644 (file)
index 0000000..eeeaf7e
--- /dev/null
@@ -0,0 +1,102 @@
+From 67f0d6d9883c13174669f88adac4f0ee656cc16a Mon Sep 17 00:00:00 2001
+From: Haoran Luo <www@aegistudio.net>
+Date: Wed, 21 Jul 2021 14:12:07 +0000
+Subject: tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop.
+
+From: Haoran Luo <www@aegistudio.net>
+
+commit 67f0d6d9883c13174669f88adac4f0ee656cc16a upstream.
+
+The "rb_per_cpu_empty()" misinterpret the condition (as not-empty) when
+"head_page" and "commit_page" of "struct ring_buffer_per_cpu" points to
+the same buffer page, whose "buffer_data_page" is empty and "read" field
+is non-zero.
+
+An error scenario could be constructed as followed (kernel perspective):
+
+1. All pages in the buffer has been accessed by reader(s) so that all of
+them will have non-zero "read" field.
+
+2. Read and clear all buffer pages so that "rb_num_of_entries()" will
+return 0 rendering there's no more data to read. It is also required
+that the "read_page", "commit_page" and "tail_page" points to the same
+page, while "head_page" is the next page of them.
+
+3. Invoke "ring_buffer_lock_reserve()" with large enough "length"
+so that it shot pass the end of current tail buffer page. Now the
+"head_page", "commit_page" and "tail_page" points to the same page.
+
+4. Discard current event with "ring_buffer_discard_commit()", so that
+"head_page", "commit_page" and "tail_page" points to a page whose buffer
+data page is now empty.
+
+When the error scenario has been constructed, "tracing_read_pipe" will
+be trapped inside a deadloop: "trace_empty()" returns 0 since
+"rb_per_cpu_empty()" returns 0 when it hits the CPU containing such
+constructed ring buffer. Then "trace_find_next_entry_inc()" always
+return NULL since "rb_num_of_entries()" reports there's no more entry
+to read. Finally "trace_seq_to_user()" returns "-EBUSY" spanking
+"tracing_read_pipe" back to the start of the "waitagain" loop.
+
+I've also written a proof-of-concept script to construct the scenario
+and trigger the bug automatically, you can use it to trace and validate
+my reasoning above:
+
+  https://github.com/aegistudio/RingBufferDetonator.git
+
+Tests has been carried out on linux kernel 5.14-rc2
+(2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version
+of kernel (for testing whether my update fixes the bug) and
+some older kernels (for range of affected kernels). Test result is
+also attached to the proof-of-concept repository.
+
+Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/
+Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio
+
+Cc: stable@vger.kernel.org
+Fixes: bf41a158cacba ("ring-buffer: make reentrant")
+Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
+Signed-off-by: Haoran Luo <www@aegistudio.net>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c |   28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -3649,10 +3649,30 @@ static bool rb_per_cpu_empty(struct ring
+       if (unlikely(!head))
+               return true;
+-      return reader->read == rb_page_commit(reader) &&
+-              (commit == reader ||
+-               (commit == head &&
+-                head->read == rb_page_commit(commit)));
++      /* Reader should exhaust content in reader page */
++      if (reader->read != rb_page_commit(reader))
++              return false;
++
++      /*
++       * If writers are committing on the reader page, knowing all
++       * committed content has been read, the ring buffer is empty.
++       */
++      if (commit == reader)
++              return true;
++
++      /*
++       * If writers are committing on a page other than reader page
++       * and head page, there should always be content to read.
++       */
++      if (commit != head)
++              return false;
++
++      /*
++       * Writers are committing on the head page, we just need
++       * to care about there're committed data, and the reader will
++       * swap reader page with head page when it is to read data.
++       */
++      return rb_page_commit(commit) == 0;
+ }
+ /**
diff --git a/queue-5.10/tracing-histogram-rename-cpu-to-common_cpu.patch b/queue-5.10/tracing-histogram-rename-cpu-to-common_cpu.patch
new file mode 100644 (file)
index 0000000..2baafd3
--- /dev/null
@@ -0,0 +1,152 @@
+From 1e3bac71c5053c99d438771fc9fa5082ae5d90aa Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Wed, 21 Jul 2021 11:00:53 -0400
+Subject: tracing/histogram: Rename "cpu" to "common_cpu"
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 1e3bac71c5053c99d438771fc9fa5082ae5d90aa upstream.
+
+Currently the histogram logic allows the user to write "cpu" in as an
+event field, and it will record the CPU that the event happened on.
+
+The problem with this is that there's a lot of events that have "cpu"
+as a real field, and using "cpu" as the CPU it ran on, makes it
+impossible to run histograms on the "cpu" field of events.
+
+For example, if I want to have a histogram on the count of the
+workqueue_queue_work event on its cpu field, running:
+
+ ># echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger
+
+Gives a misleading and wrong result.
+
+Change the command to "common_cpu" as no event should have "common_*"
+fields as that's a reserved name for fields used by all events. And
+this makes sense here as common_cpu would be a field used by all events.
+
+Now we can even do:
+
+ ># echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger
+ ># cat events/workqueue/workqueue_queue_work/hist
+ # event histogram
+ #
+ # trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active]
+ #
+
+ { common_cpu:          0, cpu:          2 } hitcount:          1
+ { common_cpu:          0, cpu:          4 } hitcount:          1
+ { common_cpu:          7, cpu:          7 } hitcount:          1
+ { common_cpu:          0, cpu:          7 } hitcount:          1
+ { common_cpu:          0, cpu:          1 } hitcount:          1
+ { common_cpu:          0, cpu:          6 } hitcount:          2
+ { common_cpu:          0, cpu:          5 } hitcount:          2
+ { common_cpu:          1, cpu:          1 } hitcount:          4
+ { common_cpu:          6, cpu:          6 } hitcount:          4
+ { common_cpu:          5, cpu:          5 } hitcount:         14
+ { common_cpu:          4, cpu:          4 } hitcount:         26
+ { common_cpu:          0, cpu:          0 } hitcount:         39
+ { common_cpu:          2, cpu:          2 } hitcount:        184
+
+Now for backward compatibility, I added a trick. If "cpu" is used, and
+the field is not found, it will fall back to "common_cpu" and work as
+it did before. This way, it will still work for old programs that use
+"cpu" to get the actual CPU, but if the event has a "cpu" as a field, it
+will get that event's "cpu" field, which is probably what it wants
+anyway.
+
+I updated the tracefs/README to include documentation about both the
+common_timestamp and the common_cpu. This way, if that text is present in
+the README, then an application can know that common_cpu is supported over
+just plain "cpu".
+
+Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home
+
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers")
+Reviewed-by: Tom Zanussi <zanussi@kernel.org>
+Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/trace/histogram.rst |    2 +-
+ kernel/trace/trace.c              |    4 ++++
+ kernel/trace/trace_events_hist.c  |   22 ++++++++++++++++------
+ 3 files changed, 21 insertions(+), 7 deletions(-)
+
+--- a/Documentation/trace/histogram.rst
++++ b/Documentation/trace/histogram.rst
+@@ -191,7 +191,7 @@ Documentation written by Tom Zanussi
+                                 with the event, in nanoseconds.  May be
+                               modified by .usecs to have timestamps
+                               interpreted as microseconds.
+-    cpu                    int  the cpu on which the event occurred.
++    common_cpu             int  the cpu on which the event occurred.
+     ====================== ==== =======================================
+ Extended error information
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -5241,6 +5241,10 @@ static const char readme_msg[] =
+       "\t            [:name=histname1]\n"
+       "\t            [:<handler>.<action>]\n"
+       "\t            [if <filter>]\n\n"
++      "\t    Note, special fields can be used as well:\n"
++      "\t            common_timestamp - to record current timestamp\n"
++      "\t            common_cpu - to record the CPU the event happened on\n"
++      "\n"
+       "\t    When a matching event is hit, an entry is added to a hash\n"
+       "\t    table using the key(s) and value(s) named, and the value of a\n"
+       "\t    sum called 'hitcount' is incremented.  Keys and values\n"
+--- a/kernel/trace/trace_events_hist.c
++++ b/kernel/trace/trace_events_hist.c
+@@ -1095,7 +1095,7 @@ static const char *hist_field_name(struc
+                field->flags & HIST_FIELD_FL_ALIAS)
+               field_name = hist_field_name(field->operands[0], ++level);
+       else if (field->flags & HIST_FIELD_FL_CPU)
+-              field_name = "cpu";
++              field_name = "common_cpu";
+       else if (field->flags & HIST_FIELD_FL_EXPR ||
+                field->flags & HIST_FIELD_FL_VAR_REF) {
+               if (field->system) {
+@@ -1975,14 +1975,24 @@ parse_field(struct hist_trigger_data *hi
+               hist_data->enable_timestamps = true;
+               if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+                       hist_data->attrs->ts_in_usecs = true;
+-      } else if (strcmp(field_name, "cpu") == 0)
++      } else if (strcmp(field_name, "common_cpu") == 0)
+               *flags |= HIST_FIELD_FL_CPU;
+       else {
+               field = trace_find_event_field(file->event_call, field_name);
+               if (!field || !field->size) {
+-                      hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
+-                      field = ERR_PTR(-EINVAL);
+-                      goto out;
++                      /*
++                       * For backward compatibility, if field_name
++                       * was "cpu", then we treat this the same as
++                       * common_cpu.
++                       */
++                      if (strcmp(field_name, "cpu") == 0) {
++                              *flags |= HIST_FIELD_FL_CPU;
++                      } else {
++                              hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
++                                       errpos(field_name));
++                              field = ERR_PTR(-EINVAL);
++                              goto out;
++                      }
+               }
+       }
+  out:
+@@ -5057,7 +5067,7 @@ static void hist_field_print(struct seq_
+               seq_printf(m, "%s=", hist_field->var.name);
+       if (hist_field->flags & HIST_FIELD_FL_CPU)
+-              seq_puts(m, "cpu");
++              seq_puts(m, "common_cpu");
+       else if (field_name) {
+               if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+                   hist_field->flags & HIST_FIELD_FL_ALIAS)
diff --git a/queue-5.10/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch b/queue-5.10/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch
new file mode 100644 (file)
index 0000000..1a612ed
--- /dev/null
@@ -0,0 +1,98 @@
+From 3b13911a2fd0dd0146c9777a254840c5466cf120 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Wed, 21 Jul 2021 19:10:08 -0400
+Subject: tracing: Synthetic event field_pos is an index not a boolean
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 3b13911a2fd0dd0146c9777a254840c5466cf120 upstream.
+
+Performing the following:
+
+ ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events
+ ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs' > events/sched/sched_waking/trigger
+ ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1:onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,prev_comm)'\
+      > events/sched/sched_switch/trigger
+ ># echo 1 > events/synthetic/enable
+
+Crashed the kernel:
+
+ BUG: kernel NULL pointer dereference, address: 000000000000001b
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 0 P4D 0
+ Oops: 0000 [#1] PREEMPT SMP
+ CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.13.0-rc5-test+ #104
+ Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
+ RIP: 0010:strlen+0x0/0x20
+ Code: f6 82 80 2b 0b bc 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2b 0b bc
+  20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10
+  48 89 f8 48 83 c0 01 80 38 9 f8 c3 31
+ RSP: 0018:ffffaa75000d79d0 EFLAGS: 00010046
+ RAX: 0000000000000002 RBX: ffff9cdb55575270 RCX: 0000000000000000
+ RDX: ffff9cdb58c7a320 RSI: ffffaa75000d7b40 RDI: 000000000000001b
+ RBP: ffffaa75000d7b40 R08: ffff9cdb40a4f010 R09: ffffaa75000d7ab8
+ R10: ffff9cdb4398c700 R11: 0000000000000008 R12: ffff9cdb58c7a320
+ R13: ffff9cdb55575270 R14: ffff9cdb58c7a000 R15: 0000000000000018
+ FS:  0000000000000000(0000) GS:ffff9cdb5aa00000(0000) knlGS:0000000000000000
+ CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 000000000000001b CR3: 00000000c0612006 CR4: 00000000001706e0
+ Call Trace:
+  trace_event_raw_event_synth+0x90/0x1d0
+  action_trace+0x5b/0x70
+  event_hist_trigger+0x4bd/0x4e0
+  ? cpumask_next_and+0x20/0x30
+  ? update_sd_lb_stats.constprop.0+0xf6/0x840
+  ? __lock_acquire.constprop.0+0x125/0x550
+  ? find_held_lock+0x32/0x90
+  ? sched_clock_cpu+0xe/0xd0
+  ? lock_release+0x155/0x440
+  ? update_load_avg+0x8c/0x6f0
+  ? enqueue_entity+0x18a/0x920
+  ? __rb_reserve_next+0xe5/0x460
+  ? ring_buffer_lock_reserve+0x12a/0x3f0
+  event_triggers_call+0x52/0xe0
+  trace_event_buffer_commit+0x1ae/0x240
+  trace_event_raw_event_sched_switch+0x114/0x170
+  __traceiter_sched_switch+0x39/0x50
+  __schedule+0x431/0xb00
+  schedule_idle+0x28/0x40
+  do_idle+0x198/0x2e0
+  cpu_startup_entry+0x19/0x20
+  secondary_startup_64_no_verify+0xc2/0xcb
+
+The reason is that the dynamic events array keeps track of the field
+position of the fields array, via the field_pos variable in the
+synth_field structure. Unfortunately, that field is a boolean for some
+reason, which means any field_pos greater than 1 will be a bug (in this
+case it was 2).
+
+Link: https://lkml.kernel.org/r/20210721191008.638bce34@oasis.local.home
+
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events")
+Reviewed-by: Tom Zanussi <zanussi@kernel.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_synth.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/trace/trace_synth.h
++++ b/kernel/trace/trace_synth.h
+@@ -14,10 +14,10 @@ struct synth_field {
+       char *name;
+       size_t size;
+       unsigned int offset;
++      unsigned int field_pos;
+       bool is_signed;
+       bool is_string;
+       bool is_dynamic;
+-      bool field_pos;
+ };
+ struct synth_event {
diff --git a/queue-5.10/userfaultfd-do-not-untag-user-pointers.patch b/queue-5.10/userfaultfd-do-not-untag-user-pointers.patch
new file mode 100644 (file)
index 0000000..c3fbf06
--- /dev/null
@@ -0,0 +1,196 @@
+From e71e2ace5721a8b921dca18b045069e7bb411277 Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:01 -0700
+Subject: userfaultfd: do not untag user pointers
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit e71e2ace5721a8b921dca18b045069e7bb411277 upstream.
+
+Patch series "userfaultfd: do not untag user pointers", v5.
+
+If a user program uses userfaultfd on ranges of heap memory, it may end
+up passing a tagged pointer to the kernel in the range.start field of
+the UFFDIO_REGISTER ioctl.  This can happen when using an MTE-capable
+allocator, or on Android if using the Tagged Pointers feature for MTE
+readiness [1].
+
+When a fault subsequently occurs, the tag is stripped from the fault
+address returned to the application in the fault.address field of struct
+uffd_msg.  However, from the application's perspective, the tagged
+address *is* the memory address, so if the application is unaware of
+memory tags, it may get confused by receiving an address that is, from
+its point of view, outside of the bounds of the allocation.  We observed
+this behavior in the kselftest for userfaultfd [2] but other
+applications could have the same problem.
+
+Address this by not untagging pointers passed to the userfaultfd ioctls.
+Instead, let the system call fail.  Also change the kselftest to use
+mmap so that it doesn't encounter this problem.
+
+[1] https://source.android.com/devices/tech/debug/tagged-pointers
+[2] tools/testing/selftests/vm/userfaultfd.c
+
+This patch (of 2):
+
+Do not untag pointers passed to the userfaultfd ioctls.  Instead, let
+the system call fail.  This will provide an early indication of problems
+with tag-unaware userspace code instead of letting the code get confused
+later, and is consistent with how we decided to handle brk/mmap/mremap
+in commit dcde237319e6 ("mm: Avoid creating virtual address aliases in
+brk()/mmap()/mremap()"), as well as being consistent with the existing
+tagged address ABI documentation relating to how ioctl arguments are
+handled.
+
+The code change is a revert of commit 7d0325749a6c ("userfaultfd: untag
+user pointers") plus some fixups to some additional calls to
+validate_range that have appeared since then.
+
+[1] https://source.android.com/devices/tech/debug/tagged-pointers
+[2] tools/testing/selftests/vm/userfaultfd.c
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-1-pcc@google.com
+Link: https://lkml.kernel.org/r/20210714195437.118982-2-pcc@google.com
+Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0a25501b
+Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: <stable@vger.kernel.org>   [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/arm64/tagged-address-abi.rst |   26 ++++++++++++++++++--------
+ fs/userfaultfd.c                           |   24 +++++++++++-------------
+ 2 files changed, 29 insertions(+), 21 deletions(-)
+
+--- a/Documentation/arm64/tagged-address-abi.rst
++++ b/Documentation/arm64/tagged-address-abi.rst
+@@ -45,14 +45,24 @@ how the user addresses are used by the k
+ 1. User addresses not accessed by the kernel but used for address space
+    management (e.g. ``mprotect()``, ``madvise()``). The use of valid
+-   tagged pointers in this context is allowed with the exception of
+-   ``brk()``, ``mmap()`` and the ``new_address`` argument to
+-   ``mremap()`` as these have the potential to alias with existing
+-   user addresses.
+-
+-   NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+-   incorrectly accept valid tagged pointers for the ``brk()``,
+-   ``mmap()`` and ``mremap()`` system calls.
++   tagged pointers in this context is allowed with these exceptions:
++
++   - ``brk()``, ``mmap()`` and the ``new_address`` argument to
++     ``mremap()`` as these have the potential to alias with existing
++      user addresses.
++
++     NOTE: This behaviour changed in v5.6 and so some earlier kernels may
++     incorrectly accept valid tagged pointers for the ``brk()``,
++     ``mmap()`` and ``mremap()`` system calls.
++
++   - The ``range.start``, ``start`` and ``dst`` arguments to the
++     ``UFFDIO_*`` ``ioctl()``s used on a file descriptor obtained from
++     ``userfaultfd()``, as fault addresses subsequently obtained by reading
++     the file descriptor will be untagged, which may otherwise confuse
++     tag-unaware programs.
++
++     NOTE: This behaviour changed in v5.14 and so some earlier kernels may
++     incorrectly accept valid tagged pointers for this system call.
+ 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
+    relaxation is disabled by default and the application thread needs to
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -1228,23 +1228,21 @@ static __always_inline void wake_userfau
+ }
+ static __always_inline int validate_range(struct mm_struct *mm,
+-                                        __u64 *start, __u64 len)
++                                        __u64 start, __u64 len)
+ {
+       __u64 task_size = mm->task_size;
+-      *start = untagged_addr(*start);
+-
+-      if (*start & ~PAGE_MASK)
++      if (start & ~PAGE_MASK)
+               return -EINVAL;
+       if (len & ~PAGE_MASK)
+               return -EINVAL;
+       if (!len)
+               return -EINVAL;
+-      if (*start < mmap_min_addr)
++      if (start < mmap_min_addr)
+               return -EINVAL;
+-      if (*start >= task_size)
++      if (start >= task_size)
+               return -EINVAL;
+-      if (len > task_size - *start)
++      if (len > task_size - start)
+               return -EINVAL;
+       return 0;
+ }
+@@ -1290,7 +1288,7 @@ static int userfaultfd_register(struct u
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
+               vm_flags |= VM_UFFD_WP;
+-      ret = validate_range(mm, &uffdio_register.range.start,
++      ret = validate_range(mm, uffdio_register.range.start,
+                            uffdio_register.range.len);
+       if (ret)
+               goto out;
+@@ -1490,7 +1488,7 @@ static int userfaultfd_unregister(struct
+       if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+               goto out;
+-      ret = validate_range(mm, &uffdio_unregister.start,
++      ret = validate_range(mm, uffdio_unregister.start,
+                            uffdio_unregister.len);
+       if (ret)
+               goto out;
+@@ -1639,7 +1637,7 @@ static int userfaultfd_wake(struct userf
+       if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
++      ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+       if (ret)
+               goto out;
+@@ -1679,7 +1677,7 @@ static int userfaultfd_copy(struct userf
+                          sizeof(uffdio_copy)-sizeof(__s64)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
++      ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+       if (ret)
+               goto out;
+       /*
+@@ -1736,7 +1734,7 @@ static int userfaultfd_zeropage(struct u
+                          sizeof(uffdio_zeropage)-sizeof(__s64)))
+               goto out;
+-      ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
++      ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+                            uffdio_zeropage.range.len);
+       if (ret)
+               goto out;
+@@ -1786,7 +1784,7 @@ static int userfaultfd_writeprotect(stru
+                          sizeof(struct uffdio_writeprotect)))
+               return -EFAULT;
+-      ret = validate_range(ctx->mm, &uffdio_wp.range.start,
++      ret = validate_range(ctx->mm, uffdio_wp.range.start,
+                            uffdio_wp.range.len);
+       if (ret)
+               return ret;