From b99f37d21177329733d7cdcb6ac3420889da340c Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 18 Feb 2025 07:30:03 -0500 Subject: [PATCH] Fixes for 6.13 Signed-off-by: Sasha Levin --- .../arp-use-rcu-protection-in-arp_xmit.patch | 45 ++ ...page-cache-after-race-between-readah.patch | 208 +++++++++ ..._get_extent_map-and-pass-btrfs_inode.patch | 70 +++ ...migrate_disable-to-avoid-calling-get.patch | 82 ++++ ...pr_info-for-checking-clocksource-syn.patch | 45 ++ ...c-string-helpers-into-c-only-kernel-.patch | 70 +++ ...md-pstate-convert-mutex-use-to-guard.patch | 132 ++++++ ...tate-fix-cpufreq_policy-ref-counting.patch | 55 +++ ...te-merge-amd_pstate_epp_cpu_offline-.patch | 69 +++ ...te-refactor-amd_pstate_epp_reenable-.patch | 97 ++++ ...te-remove-the-cppc_state-check-in-of.patch | 56 +++ ...ream-pollin-in-xe_oa_buffer_check_un.patch | 69 +++ ...xpose-an-unblock-after-n-reports-oa-.patch | 176 +++++++ ...api-make-oa-buffer-size-configurable.patch | 280 +++++++++++ ...th-iavf-extend-the-netdev_lock-usage.patch | 297 ++++++++++++ ...-use-rcu-protection-to-fetch-dev_net.patch | 81 ++++ ...ading-space-from-irq_chip-irq_print_.patch | 81 ++++ ...ke-sure-rumble-work-is-canceled-on-r.patch | 38 ++ ...ve-hidraw-input-un-registering-to-wo.patch | 117 +++++ ...f-fix-a-locking-bug-in-an-error-path.patch | 38 ++ ...static-inline-dst_dev_overhead-to-ds.patch | 49 ++ ...md-cleanup-struct-io_uring_cmd_data-.patch | 49 ++ ...md-don-t-assume-io_uring_cmd_data-la.patch | 46 ++ ...md-switch-sqe-to-async_data-on-eagai.patch | 88 ++++ ...md-unconditionally-copy-sqes-at-prep.patch | 92 ++++ ...d-rcu-protection-to-ip4_dst_hoplimit.patch | 47 ++ .../ipv4-icmp-convert-to-dev_net_rcu.patch | 150 ++++++ ...cu-protection-in-__ip_rt_update_pmtu.patch | 77 ++++ ...e-rcu-protection-in-inet_select_addr.patch | 41 ++ ...otection-in-ip_dst_mtu_maybe_forward.patch | 57 +++ ...cu-protection-in-ipv4_default_advmss.patch | 48 ++ ...-use-rcu-protection-in-rt_is_expired.patch | 44 ++ .../ipv6-icmp-convert-to-dev_net_rcu.patch | 191 ++++++++ ...st-add-rcu-protection-to-mld_newpack.patch | 80 ++++ ...-extend-rcu-protection-in-igmp6_send.patch | 105 +++++ ...rcu-protection-in-ip6_default_advmss.patch | 49 ++ ...end-rcu-protection-in-ndisc_send_skb.patch | 72 +++ ...se-rcu-protection-in-ndisc_alloc_skb.patch | 59 +++ ...use-rcu-protection-in-__neigh_notify.patch | 58 +++ queue-6.13/net-add-dev_net_rcu-helper.patch | 62 +++ ...d-netdev-up-protected-by-netdev_lock.patch | 118 +++++ ...dd-netdev_lock-netdev_unlock-helpers.patch | 435 ++++++++++++++++++ ...-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch | 94 ++++ ...m6_iptunnel-mitigate-2-realloc-issue.patch | 190 ++++++++ ...pl_iptunnel-mitigate-2-realloc-issue.patch | 154 +++++++ ...g6_iptunnel-mitigate-2-realloc-issue.patch | 254 ++++++++++ ...netdev_lock-protect-netdev-reg_state.patch | 84 ++++ ...-retain-napi-ordering-on-netdev-napi.patch | 123 +++++ ...ct-netdev-napi_list-with-netdev_lock.patch | 213 +++++++++ ...rcu-protection-in-ovs_vport_cmd_fill.patch | 66 +++ ...introduce-and-use-a-single-page-frag.patch | 214 +++++++++ ...ntroduce-and-use-a-single-page-frag-.patch | 232 ++++++++++ ...fzero-init-padding-bits-to-bindgen_s.patch | 42 ++ ...etif_napi_add_tx-and-napi_enable-fro.patch | 54 +++ ...-broken-vmlinux-path-for-vmlinux_btf.patch | 47 ++ ...ve-unnecessary-i-flags-from-libbpf-e.patch | 63 +++ ...re-introduce-a-new-clock_gating-lock.patch | 336 ++++++++++++++ ...e-introduce-ufshcd_has_pending_tasks.patch | 58 +++ ...epare-to-introduce-a-new-clock_gatin.patch | 61 +++ 
...gling-of-clk_gating.state-when-clock.patch | 48 ++ queue-6.13/series | 60 +++ 61 files changed, 6516 insertions(+) create mode 100644 queue-6.13/arp-use-rcu-protection-in-arp_xmit.patch create mode 100644 queue-6.13/btrfs-fix-stale-page-cache-after-race-between-readah.patch create mode 100644 queue-6.13/btrfs-rename-__get_extent_map-and-pass-btrfs_inode.patch create mode 100644 queue-6.13/clocksource-use-migrate_disable-to-avoid-calling-get.patch create mode 100644 queue-6.13/clocksource-use-pr_info-for-checking-clocksource-syn.patch create mode 100644 queue-6.13/compiler.h-move-c-string-helpers-into-c-only-kernel-.patch create mode 100644 queue-6.13/cpufreq-amd-pstate-convert-mutex-use-to-guard.patch create mode 100644 queue-6.13/cpufreq-amd-pstate-fix-cpufreq_policy-ref-counting.patch create mode 100644 queue-6.13/cpufreq-amd-pstate-merge-amd_pstate_epp_cpu_offline-.patch create mode 100644 queue-6.13/cpufreq-amd-pstate-refactor-amd_pstate_epp_reenable-.patch create mode 100644 queue-6.13/cpufreq-amd-pstate-remove-the-cppc_state-check-in-of.patch create mode 100644 queue-6.13/drm-xe-oa-set-stream-pollin-in-xe_oa_buffer_check_un.patch create mode 100644 queue-6.13/drm-xe-oa-uapi-expose-an-unblock-after-n-reports-oa-.patch create mode 100644 queue-6.13/drm-xe-oa-uapi-make-oa-buffer-size-configurable.patch create mode 100644 queue-6.13/eth-iavf-extend-the-netdev_lock-usage.patch create mode 100644 queue-6.13/flow_dissector-use-rcu-protection-to-fetch-dev_net.patch create mode 100644 queue-6.13/genirq-remove-leading-space-from-irq_chip-irq_print_.patch create mode 100644 queue-6.13/hid-hid-steam-make-sure-rumble-work-is-canceled-on-r.patch create mode 100644 queue-6.13/hid-hid-steam-move-hidraw-input-un-registering-to-wo.patch create mode 100644 queue-6.13/iavf-fix-a-locking-bug-in-an-error-path.patch create mode 100644 queue-6.13/include-net-add-static-inline-dst_dev_overhead-to-ds.patch create mode 100644 queue-6.13/io_uring-uring_cmd-cleanup-struct-io_uring_cmd_data-.patch create mode 100644 queue-6.13/io_uring-uring_cmd-don-t-assume-io_uring_cmd_data-la.patch create mode 100644 queue-6.13/io_uring-uring_cmd-switch-sqe-to-async_data-on-eagai.patch create mode 100644 queue-6.13/io_uring-uring_cmd-unconditionally-copy-sqes-at-prep.patch create mode 100644 queue-6.13/ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch create mode 100644 queue-6.13/ipv4-icmp-convert-to-dev_net_rcu.patch create mode 100644 queue-6.13/ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch create mode 100644 queue-6.13/ipv4-use-rcu-protection-in-inet_select_addr.patch create mode 100644 queue-6.13/ipv4-use-rcu-protection-in-ip_dst_mtu_maybe_forward.patch create mode 100644 queue-6.13/ipv4-use-rcu-protection-in-ipv4_default_advmss.patch create mode 100644 queue-6.13/ipv4-use-rcu-protection-in-rt_is_expired.patch create mode 100644 queue-6.13/ipv6-icmp-convert-to-dev_net_rcu.patch create mode 100644 queue-6.13/ipv6-mcast-add-rcu-protection-to-mld_newpack.patch create mode 100644 queue-6.13/ipv6-mcast-extend-rcu-protection-in-igmp6_send.patch create mode 100644 queue-6.13/ipv6-use-rcu-protection-in-ip6_default_advmss.patch create mode 100644 queue-6.13/ndisc-extend-rcu-protection-in-ndisc_send_skb.patch create mode 100644 queue-6.13/ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch create mode 100644 queue-6.13/neighbour-use-rcu-protection-in-__neigh_notify.patch create mode 100644 queue-6.13/net-add-dev_net_rcu-helper.patch create mode 100644 queue-6.13/net-add-netdev-up-protected-by-netdev_lock.patch create mode 
100644 queue-6.13/net-add-netdev_lock-netdev_unlock-helpers.patch create mode 100644 queue-6.13/net-ipv6-fix-dst-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch create mode 100644 queue-6.13/net-ipv6-ioam6_iptunnel-mitigate-2-realloc-issue.patch create mode 100644 queue-6.13/net-ipv6-rpl_iptunnel-mitigate-2-realloc-issue.patch create mode 100644 queue-6.13/net-ipv6-seg6_iptunnel-mitigate-2-realloc-issue.patch create mode 100644 queue-6.13/net-make-netdev_lock-protect-netdev-reg_state.patch create mode 100644 queue-6.13/net-make-sure-we-retain-napi-ordering-on-netdev-napi.patch create mode 100644 queue-6.13/net-protect-netdev-napi_list-with-netdev_lock.patch create mode 100644 queue-6.13/openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch create mode 100644 queue-6.13/reapply-net-skb-introduce-and-use-a-single-page-frag.patch create mode 100644 queue-6.13/revert-net-skb-introduce-and-use-a-single-page-frag-.patch create mode 100644 queue-6.13/rust-kbuild-add-fzero-init-padding-bits-to-bindgen_s.patch create mode 100644 queue-6.13/s390-qeth-move-netif_napi_add_tx-and-napi_enable-fro.patch create mode 100644 queue-6.13/samples-hid-fix-broken-vmlinux-path-for-vmlinux_btf.patch create mode 100644 queue-6.13/samples-hid-remove-unnecessary-i-flags-from-libbpf-e.patch create mode 100644 queue-6.13/scsi-ufs-core-introduce-a-new-clock_gating-lock.patch create mode 100644 queue-6.13/scsi-ufs-core-introduce-ufshcd_has_pending_tasks.patch create mode 100644 queue-6.13/scsi-ufs-core-prepare-to-introduce-a-new-clock_gatin.patch create mode 100644 queue-6.13/scsi-ufs-fix-toggling-of-clk_gating.state-when-clock.patch diff --git a/queue-6.13/arp-use-rcu-protection-in-arp_xmit.patch b/queue-6.13/arp-use-rcu-protection-in-arp_xmit.patch new file mode 100644 index 0000000000..135c79eea9 --- /dev/null +++ b/queue-6.13/arp-use-rcu-protection-in-arp_xmit.patch @@ -0,0 +1,45 @@ +From 42f2ee9e472d234cd12a59e5b7299468805358ea Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:36 +0000 +Subject: arp: use RCU protection in arp_xmit() + +From: Eric Dumazet + +[ Upstream commit a42b69f692165ec39db42d595f4f65a4c8f42e44 ] + +arp_xmit() can be called without RTNL or RCU protection. + +Use RCU protection to avoid potential UAF. + +Fixes: 29a26a568038 ("netfilter: Pass struct net into the netfilter hooks") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-5-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv4/arp.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c +index cb9a7ed8abd3a..f23a1ec6694cb 100644 +--- a/net/ipv4/arp.c ++++ b/net/ipv4/arp.c +@@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb + */ + void arp_xmit(struct sk_buff *skb) + { ++ rcu_read_lock(); + /* Send it off, maybe filter it using firewalling first. 
*/ + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, +- dev_net(skb->dev), NULL, skb, NULL, skb->dev, ++ dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, + arp_xmit_finish); ++ rcu_read_unlock(); + } + EXPORT_SYMBOL(arp_xmit); + +-- +2.39.5 + diff --git a/queue-6.13/btrfs-fix-stale-page-cache-after-race-between-readah.patch b/queue-6.13/btrfs-fix-stale-page-cache-after-race-between-readah.patch new file mode 100644 index 0000000000..b1acc6735b --- /dev/null +++ b/queue-6.13/btrfs-fix-stale-page-cache-after-race-between-readah.patch @@ -0,0 +1,208 @@ +From 9da018ab8a1121beec117c40f5bda5d0857a3377 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 4 Feb 2025 11:02:32 +0000 +Subject: btrfs: fix stale page cache after race between readahead and direct + IO write + +From: Filipe Manana + +[ Upstream commit acc18e1c1d8c0d59d793cf87790ccfcafb1bf5f0 ] + +After commit ac325fc2aad5 ("btrfs: do not hold the extent lock for entire +read") we can now trigger a race between a task doing a direct IO write +and readahead. When this race is triggered it results in tasks getting +stale data when they attempt do a buffered read (including the task that +did the direct IO write). + +This race can be sporadically triggered with test case generic/418, failing +like this: + + $ ./check generic/418 + FSTYP -- btrfs + PLATFORM -- Linux/x86_64 debian0 6.13.0-rc7-btrfs-next-185+ #17 SMP PREEMPT_DYNAMIC Mon Feb 3 12:28:46 WET 2025 + MKFS_OPTIONS -- /dev/sdc + MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 + + generic/418 14s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad) + --- tests/generic/418.out 2020-06-10 19:29:03.850519863 +0100 + +++ /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad 2025-02-03 15:42:36.974609476 +0000 + @@ -1,2 +1,5 @@ + QA output created by 418 + +cmpbuf: offset 0: Expected: 0x1, got 0x0 + +[6:0] FAIL - comparison failed, offset 24576 + +diotest -wp -b 4096 -n 8 -i 4 failed at loop 3 + Silence is golden + ... + (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/418.out /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad' to see the entire diff) + Ran: generic/418 + Failures: generic/418 + Failed 1 of 1 tests + +The race happens like this: + +1) A file has a prealloc extent for the range [16K, 28K); + +2) Task A starts a direct IO write against file range [24K, 28K). + At the start of the direct IO write it invalidates the page cache at + __iomap_dio_rw() with kiocb_invalidate_pages() for the 4K page at file + offset 24K; + +3) Task A enters btrfs_dio_iomap_begin() and locks the extent range + [24K, 28K); + +4) Task B starts a readahead for file range [16K, 28K), entering + btrfs_readahead(). + + First it attempts to read the page at offset 16K by entering + btrfs_do_readpage(), where it calls get_extent_map(), locks the range + [16K, 20K) and gets the extent map for the range [16K, 28K), caching + it into the 'em_cached' variable declared in the local stack of + btrfs_readahead(), and then unlocks the range [16K, 20K). + + Since the extent map has the prealloc flag, at btrfs_do_readpage() we + zero out the page's content and don't submit any bio to read the page + from the extent. + + Then it attempts to read the page at offset 20K entering + btrfs_do_readpage() where we reuse the previously cached extent map + (decided by get_extent_map()) since it spans the page's range and + it's still in the inode's extent map tree. 
+ + Just like for the previous page, we zero out the page's content since + the extent map has the prealloc flag set. + + Then it attempts to read the page at offset 24K entering + btrfs_do_readpage() where we reuse the previously cached extent map + (decided by get_extent_map()) since it spans the page's range and + it's still in the inode's extent map tree. + + Just like for the previous pages, we zero out the page's content since + the extent map has the prealloc flag set. Note that we didn't lock the + extent range [24K, 28K), so we didn't synchronize with the ongoing + direct IO write being performed by task A; + +5) Task A enters btrfs_create_dio_extent() and creates an ordered extent + for the range [24K, 28K), with the flags BTRFS_ORDERED_DIRECT and + BTRFS_ORDERED_PREALLOC set; + +6) Task A unlocks the range [24K, 28K) at btrfs_dio_iomap_begin(); + +7) The ordered extent enters btrfs_finish_one_ordered() and locks the + range [24K, 28K); + +8) Task A enters fs/iomap/direct-io.c:iomap_dio_complete() and it tries + to invalidate the page at offset 24K by calling + kiocb_invalidate_post_direct_write(), resulting in a call chain that + ends up at btrfs_release_folio(). + + The btrfs_release_folio() call ends up returning false because the range + for the page at file offset 24K is currently locked by the task doing + the ordered extent completion in the previous step (7), so we have: + + btrfs_release_folio() -> + __btrfs_release_folio() -> + try_release_extent_mapping() -> + try_release_extent_state() + + This last function checking that the range is locked and returning false + and propagating it up to btrfs_release_folio(). + + So this results in a failure to invalidate the page and + kiocb_invalidate_post_direct_write() triggers this message logged in + dmesg: + + Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O! + + After this we leave the page cache with stale data for the file range + [24K, 28K), filled with zeroes instead of the data written by direct IO + write (all bytes with a 0x01 value), so any task attempting to read with + buffered IO, including the task that did the direct IO write, will get + all bytes in the range with a 0x00 value instead of the written data. + +Fix this by locking the range, with btrfs_lock_and_flush_ordered_range(), +at the two callers of btrfs_do_readpage() instead of doing it at +get_extent_map(), just like we did before commit ac325fc2aad5 ("btrfs: do +not hold the extent lock for entire read"), and unlocking the range after +all the calls to btrfs_do_readpage(). This way we never reuse a cached +extent map without flushing any pending ordered extents from a concurrent +direct IO write. 
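+
+As a sketch of the resulting pattern (condensed from the hunks below,
+omitting error handling and the bio plumbing; all identifiers are the
+ones used in the diff), the readahead path becomes:
+
+    /* Take the range lock once around the whole readahead window,
+     * flushing pending ordered extents (e.g. from a concurrent
+     * direct IO write) before any cached extent map can be reused.
+     */
+    btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+
+    while ((folio = readahead_folio(rac)) != NULL)
+        btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+
+    unlock_extent(&inode->io_tree, start, end, &cached_state);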
+ +Fixes: ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read") +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent_io.c | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index e23eb1bca4508..d14ecbe24d775 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -906,7 +906,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, + u64 len, struct extent_map **em_cached) + { + struct extent_map *em; +- struct extent_state *cached_state = NULL; + + ASSERT(em_cached); + +@@ -922,14 +921,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, + *em_cached = NULL; + } + +- btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); + em = btrfs_get_extent(inode, folio, start, len); + if (!IS_ERR(em)) { + BUG_ON(*em_cached); + refcount_inc(&em->refs); + *em_cached = em; + } +- unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); + + return em; + } +@@ -1086,11 +1083,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, + + int btrfs_read_folio(struct file *file, struct folio *folio) + { ++ struct btrfs_inode *inode = folio_to_inode(folio); ++ const u64 start = folio_pos(folio); ++ const u64 end = start + folio_size(folio) - 1; ++ struct extent_state *cached_state = NULL; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; + int ret; + ++ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); ++ unlock_extent(&inode->io_tree, start, end, &cached_state); ++ + free_extent_map(em_cached); + + /* +@@ -2331,12 +2335,20 @@ void btrfs_readahead(struct readahead_control *rac) + { + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct folio *folio; ++ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); ++ const u64 start = readahead_pos(rac); ++ const u64 end = start + readahead_length(rac) - 1; ++ struct extent_state *cached_state = NULL; + struct extent_map *em_cached = NULL; + u64 prev_em_start = (u64)-1; + ++ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ++ + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + ++ unlock_extent(&inode->io_tree, start, end, &cached_state); ++ + if (em_cached) + free_extent_map(em_cached); + submit_one_bio(&bio_ctrl); +-- +2.39.5 + diff --git a/queue-6.13/btrfs-rename-__get_extent_map-and-pass-btrfs_inode.patch b/queue-6.13/btrfs-rename-__get_extent_map-and-pass-btrfs_inode.patch new file mode 100644 index 0000000000..772570185c --- /dev/null +++ b/queue-6.13/btrfs-rename-__get_extent_map-and-pass-btrfs_inode.patch @@ -0,0 +1,70 @@ +From b9bf1ece94cfb4aca2b6ddc514ea1ad68a9f087e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Jan 2025 11:24:15 +0100 +Subject: btrfs: rename __get_extent_map() and pass btrfs_inode + +From: David Sterba + +[ Upstream commit 06de96faf795b5c276a3be612da6b08c6112e747 ] + +The double underscore naming scheme does not apply here, there's only +only get_extent_map(). As the definition is changed also pass the struct +btrfs_inode. 
+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Anand Jain +Signed-off-by: David Sterba +Stable-dep-of: acc18e1c1d8c ("btrfs: fix stale page cache after race between readahead and direct IO write") +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent_io.c | 15 +++++++-------- + 1 file changed, 7 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index b923d0cec61c7..e23eb1bca4508 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -901,9 +901,9 @@ void clear_folio_extent_mapped(struct folio *folio) + folio_detach_private(folio); + } + +-static struct extent_map *__get_extent_map(struct inode *inode, +- struct folio *folio, u64 start, +- u64 len, struct extent_map **em_cached) ++static struct extent_map *get_extent_map(struct btrfs_inode *inode, ++ struct folio *folio, u64 start, ++ u64 len, struct extent_map **em_cached) + { + struct extent_map *em; + struct extent_state *cached_state = NULL; +@@ -922,14 +922,14 @@ static struct extent_map *__get_extent_map(struct inode *inode, + *em_cached = NULL; + } + +- btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); +- em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); ++ btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); ++ em = btrfs_get_extent(inode, folio, start, len); + if (!IS_ERR(em)) { + BUG_ON(*em_cached); + refcount_inc(&em->refs); + *em_cached = em; + } +- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); ++ unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); + + return em; + } +@@ -985,8 +985,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, + end_folio_read(folio, true, cur, iosize); + break; + } +- em = __get_extent_map(inode, folio, cur, end - cur + 1, +- em_cached); ++ em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); + if (IS_ERR(em)) { + end_folio_read(folio, false, cur, end + 1 - cur); + return PTR_ERR(em); +-- +2.39.5 + diff --git a/queue-6.13/clocksource-use-migrate_disable-to-avoid-calling-get.patch b/queue-6.13/clocksource-use-migrate_disable-to-avoid-calling-get.patch new file mode 100644 index 0000000000..75254e1b89 --- /dev/null +++ b/queue-6.13/clocksource-use-migrate_disable-to-avoid-calling-get.patch @@ -0,0 +1,82 @@ +From f76a3e8e8fc1c0219f7e0bda36fe78258b264235 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 31 Jan 2025 12:33:23 -0500 +Subject: clocksource: Use migrate_disable() to avoid calling get_random_u32() + in atomic context + +From: Waiman Long + +[ Upstream commit 6bb05a33337b2c842373857b63de5c9bf1ae2a09 ] + +The following bug report happened with a PREEMPT_RT kernel: + + BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 + in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2012, name: kwatchdog + preempt_count: 1, expected: 0 + RCU nest depth: 0, expected: 0 + get_random_u32+0x4f/0x110 + clocksource_verify_choose_cpus+0xab/0x1a0 + clocksource_verify_percpu.part.0+0x6b/0x330 + clocksource_watchdog_kthread+0x193/0x1a0 + +It is due to the fact that clocksource_verify_choose_cpus() is invoked with +preemption disabled. This function invokes get_random_u32() to obtain +random numbers for choosing CPUs. The batched_entropy_32 local lock and/or +the base_crng.lock spinlock in driver/char/random.c will be acquired during +the call. In PREEMPT_RT kernel, they are both sleeping locks and so cannot +be acquired in atomic context. 
+ +Fix this problem by using migrate_disable() to allow smp_processor_id() to +be reliably used without introducing atomic context. preempt_disable() is +then called after clocksource_verify_choose_cpus() but before the +clocksource measurement is being run to avoid introducing unexpected +latency. + +Fixes: 7560c02bdffb ("clocksource: Check per-CPU clock synchronization when marked unstable") +Suggested-by: Sebastian Andrzej Siewior +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Paul E. McKenney +Reviewed-by: Sebastian Andrzej Siewior +Link: https://lore.kernel.org/all/20250131173323.891943-2-longman@redhat.com +Signed-off-by: Sasha Levin +--- + kernel/time/clocksource.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c +index 77d9566d3aa68..2a7802ec480cc 100644 +--- a/kernel/time/clocksource.c ++++ b/kernel/time/clocksource.c +@@ -373,10 +373,10 @@ void clocksource_verify_percpu(struct clocksource *cs) + cpumask_clear(&cpus_ahead); + cpumask_clear(&cpus_behind); + cpus_read_lock(); +- preempt_disable(); ++ migrate_disable(); + clocksource_verify_choose_cpus(); + if (cpumask_empty(&cpus_chosen)) { +- preempt_enable(); ++ migrate_enable(); + cpus_read_unlock(); + pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + return; +@@ -384,6 +384,7 @@ void clocksource_verify_percpu(struct clocksource *cs) + testcpu = smp_processor_id(); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); ++ preempt_disable(); + for_each_cpu(cpu, &cpus_chosen) { + if (cpu == testcpu) + continue; +@@ -403,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) + cs_nsec_min = cs_nsec; + } + preempt_enable(); ++ migrate_enable(); + cpus_read_unlock(); + if (!cpumask_empty(&cpus_ahead)) + pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", +-- +2.39.5 + diff --git a/queue-6.13/clocksource-use-pr_info-for-checking-clocksource-syn.patch b/queue-6.13/clocksource-use-pr_info-for-checking-clocksource-syn.patch new file mode 100644 index 0000000000..5a201a4563 --- /dev/null +++ b/queue-6.13/clocksource-use-pr_info-for-checking-clocksource-syn.patch @@ -0,0 +1,45 @@ +From ec0c09e7665b5bf56f92a5d3716b1b3bfff11160 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 24 Jan 2025 20:54:41 -0500 +Subject: clocksource: Use pr_info() for "Checking clocksource synchronization" + message + +From: Waiman Long + +[ Upstream commit 1f566840a82982141f94086061927a90e79440e5 ] + +The "Checking clocksource synchronization" message is normally printed +when clocksource_verify_percpu() is called for a given clocksource if +both the CLOCK_SOURCE_UNSTABLE and CLOCK_SOURCE_VERIFY_PERCPU flags +are set. + +It is an informational message and so pr_info() is the correct choice. + +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Paul E. 
McKenney +Acked-by: John Stultz +Link: https://lore.kernel.org/all/20250125015442.3740588-1-longman@redhat.com +Stable-dep-of: 6bb05a33337b ("clocksource: Use migrate_disable() to avoid calling get_random_u32() in atomic context") +Signed-off-by: Sasha Levin +--- + kernel/time/clocksource.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c +index 7304d7cf47f2d..77d9566d3aa68 100644 +--- a/kernel/time/clocksource.c ++++ b/kernel/time/clocksource.c +@@ -382,7 +382,8 @@ void clocksource_verify_percpu(struct clocksource *cs) + return; + } + testcpu = smp_processor_id(); +- pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); ++ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", ++ cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + for_each_cpu(cpu, &cpus_chosen) { + if (cpu == testcpu) + continue; +-- +2.39.5 + diff --git a/queue-6.13/compiler.h-move-c-string-helpers-into-c-only-kernel-.patch b/queue-6.13/compiler.h-move-c-string-helpers-into-c-only-kernel-.patch new file mode 100644 index 0000000000..fe2f97fbc2 --- /dev/null +++ b/queue-6.13/compiler.h-move-c-string-helpers-into-c-only-kernel-.patch @@ -0,0 +1,70 @@ +From 6ebc76bf95d6ec3b958b501d6fce71f85ef2d576 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 12:32:49 -0800 +Subject: compiler.h: Move C string helpers into C-only kernel section + +From: Kees Cook + +[ Upstream commit cb7380de9e4cbc9a24216b722ec50e092ae83036 ] + +The C kernel helpers for evaluating C Strings were positioned where they +were visible to assembly inclusion, which was not intended. Move them +into the kernel and C-only area of the header so future changes won't +confuse the assembler. + +Fixes: d7a516c6eeae ("compiler.h: Fix undefined BUILD_BUG_ON_ZERO()") +Fixes: 559048d156ff ("string: Check for "nonstring" attribute on strscpy() arguments") +Reviewed-by: Miguel Ojeda +Signed-off-by: Kees Cook +Signed-off-by: Sasha Levin +--- + include/linux/compiler.h | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/include/linux/compiler.h b/include/linux/compiler.h +index 240c632c5b957..7af999a131cb2 100644 +--- a/include/linux/compiler.h ++++ b/include/linux/compiler.h +@@ -214,6 +214,19 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, + __v; \ + }) + ++#ifdef __CHECKER__ ++#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) ++#else /* __CHECKER__ */ ++#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) ++#endif /* __CHECKER__ */ ++ ++/* &a[0] degrades to a pointer: a different type from an array */ ++#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") ++ ++/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. 
*/ ++#define __must_be_cstr(p) \ ++ __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") ++ + #endif /* __KERNEL__ */ + + /** +@@ -254,19 +267,6 @@ static inline void *offset_to_ptr(const int *off) + + #define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) + +-#ifdef __CHECKER__ +-#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +-#else /* __CHECKER__ */ +-#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +-#endif /* __CHECKER__ */ +- +-/* &a[0] degrades to a pointer: a different type from an array */ +-#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") +- +-/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ +-#define __must_be_cstr(p) \ +- __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") +- + /* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. +-- +2.39.5 + diff --git a/queue-6.13/cpufreq-amd-pstate-convert-mutex-use-to-guard.patch b/queue-6.13/cpufreq-amd-pstate-convert-mutex-use-to-guard.patch new file mode 100644 index 0000000000..d12970a29d --- /dev/null +++ b/queue-6.13/cpufreq-amd-pstate-convert-mutex-use-to-guard.patch @@ -0,0 +1,132 @@ +From 63808f2c8f69e194896171320d0de3cb2ccfc4e4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 Dec 2024 12:52:37 -0600 +Subject: cpufreq/amd-pstate: convert mutex use to guard() + +From: Mario Limonciello + +[ Upstream commit 6c093d5a5b73ec1caf1e706510ae6031af2f9d43 ] + +Using scoped guard declaration will unlock mutexes automatically. + +Reviewed-by: Gautham R. Shenoy +Link: https://lore.kernel.org/r/20241209185248.16301-5-mario.limonciello@amd.com +Signed-off-by: Mario Limonciello +Stable-dep-of: 3ace20038e19 ("cpufreq/amd-pstate: Fix cpufreq_policy ref counting") +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/amd-pstate.c | 32 ++++++++++++-------------------- + 1 file changed, 12 insertions(+), 20 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 03a5fd713ad59..bdaa19c25887b 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -727,12 +727,12 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) + pr_err("Boost mode is not supported by this processor or SBIOS\n"); + return -EOPNOTSUPP; + } +- mutex_lock(&amd_pstate_driver_lock); ++ guard(mutex)(&amd_pstate_driver_lock); ++ + ret = amd_pstate_cpu_boost_update(policy, state); + WRITE_ONCE(cpudata->boost_state, !ret ? state : false); + policy->boost_enabled = !ret ? 
state : false; + refresh_frequency_limits(policy); +- mutex_unlock(&amd_pstate_driver_lock); + + return ret; + } +@@ -823,7 +823,8 @@ static void amd_pstate_update_limits(unsigned int cpu) + if (!amd_pstate_prefcore) + return; + +- mutex_lock(&amd_pstate_driver_lock); ++ guard(mutex)(&amd_pstate_driver_lock); ++ + ret = amd_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; +@@ -843,7 +844,6 @@ static void amd_pstate_update_limits(unsigned int cpu) + if (!highest_perf_changed) + cpufreq_update_policy(cpu); + +- mutex_unlock(&amd_pstate_driver_lock); + } + + /* +@@ -1172,11 +1172,11 @@ static ssize_t store_energy_performance_preference( + if (ret < 0) + return -EINVAL; + +- mutex_lock(&amd_pstate_limits_lock); ++ guard(mutex)(&amd_pstate_limits_lock); ++ + ret = amd_pstate_set_energy_pref_index(cpudata, ret); +- mutex_unlock(&amd_pstate_limits_lock); + +- return ret ?: count; ++ return ret ? ret : count; + } + + static ssize_t show_energy_performance_preference( +@@ -1340,13 +1340,10 @@ EXPORT_SYMBOL_GPL(amd_pstate_update_status); + static ssize_t status_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +- ssize_t ret; + +- mutex_lock(&amd_pstate_driver_lock); +- ret = amd_pstate_show_status(buf); +- mutex_unlock(&amd_pstate_driver_lock); ++ guard(mutex)(&amd_pstate_driver_lock); + +- return ret; ++ return amd_pstate_show_status(buf); + } + + static ssize_t status_store(struct device *a, struct device_attribute *b, +@@ -1355,9 +1352,8 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, + char *p = memchr(buf, '\n', count); + int ret; + +- mutex_lock(&amd_pstate_driver_lock); ++ guard(mutex)(&amd_pstate_driver_lock); + ret = amd_pstate_update_status(buf, p ? p - buf : count); +- mutex_unlock(&amd_pstate_driver_lock); + + return ret < 0 ? ret : count; + } +@@ -1640,13 +1636,11 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + + min_perf = READ_ONCE(cpudata->lowest_perf); + +- mutex_lock(&amd_pstate_limits_lock); ++ guard(mutex)(&amd_pstate_limits_lock); + + amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); + amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); + +- mutex_unlock(&amd_pstate_limits_lock); +- + return 0; + } + +@@ -1675,13 +1669,11 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->suspended) { +- mutex_lock(&amd_pstate_limits_lock); ++ guard(mutex)(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ + amd_pstate_epp_reenable(cpudata); + +- mutex_unlock(&amd_pstate_limits_lock); +- + cpudata->suspended = false; + } + +-- +2.39.5 + diff --git a/queue-6.13/cpufreq-amd-pstate-fix-cpufreq_policy-ref-counting.patch b/queue-6.13/cpufreq-amd-pstate-fix-cpufreq_policy-ref-counting.patch new file mode 100644 index 0000000000..71d8061056 --- /dev/null +++ b/queue-6.13/cpufreq-amd-pstate-fix-cpufreq_policy-ref-counting.patch @@ -0,0 +1,55 @@ +From ebb30c2a945bbaf83c9017b3b89e3e4bd9e74fe1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 11:25:20 +0000 +Subject: cpufreq/amd-pstate: Fix cpufreq_policy ref counting + +From: Dhananjay Ugwekar + +[ Upstream commit 3ace20038e19f23fe73259513f1f08d4bf1a3c83 ] + +amd_pstate_update_limits() takes a cpufreq_policy reference but doesn't +decrement the refcount in one of the exit paths, fix that. 
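+
+In schematic form the bug class is (hypothetical "precondition" name;
+the driver's actual early check is the amd_pstate_prefcore test):
+
+    struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); /* +1 ref */
+
+    if (!policy)
+        return;
+
+    if (!precondition)
+        return; /* bug: the reference taken above is never dropped */
+
+    /* ... use policy, then drop the reference ... */
+
+Moving checks that do not need the policy in front of
+cpufreq_cpu_get() removes the leaking exit path.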
+ +Fixes: 45722e777fd9 ("cpufreq: amd-pstate: Optimize amd_pstate_update_limits()") +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +Link: https://lore.kernel.org/r/20250205112523.201101-10-dhananjay.ugwekar@amd.com +Signed-off-by: Mario Limonciello +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/amd-pstate.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index bdaa19c25887b..0aea414b8ac4a 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -809,20 +809,21 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) + + static void amd_pstate_update_limits(unsigned int cpu) + { +- struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); ++ struct cpufreq_policy *policy = NULL; + struct amd_cpudata *cpudata; + u32 prev_high = 0, cur_high = 0; + int ret; + bool highest_perf_changed = false; + ++ if (!amd_pstate_prefcore) ++ return; ++ ++ policy = cpufreq_cpu_get(cpu); + if (!policy) + return; + + cpudata = policy->driver_data; + +- if (!amd_pstate_prefcore) +- return; +- + guard(mutex)(&amd_pstate_driver_lock); + + ret = amd_get_highest_perf(cpu, &cur_high); +-- +2.39.5 + diff --git a/queue-6.13/cpufreq-amd-pstate-merge-amd_pstate_epp_cpu_offline-.patch b/queue-6.13/cpufreq-amd-pstate-merge-amd_pstate_epp_cpu_offline-.patch new file mode 100644 index 0000000000..ffc1f1dfec --- /dev/null +++ b/queue-6.13/cpufreq-amd-pstate-merge-amd_pstate_epp_cpu_offline-.patch @@ -0,0 +1,69 @@ +From f58c9557a52aac1f523bb20d981b9c5d217f93fc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Dec 2024 14:48:42 +0000 +Subject: cpufreq/amd-pstate: Merge amd_pstate_epp_cpu_offline() and + amd_pstate_epp_offline() + +From: Dhananjay Ugwekar + +[ Upstream commit 53ec2101dfede8fecdd240662281a12e537c3411 ] + +amd_pstate_epp_offline() is only called from within +amd_pstate_epp_cpu_offline() and doesn't make much sense to have it at all. +Hence, remove it. + +Also remove the unncessary debug print in the offline path while at it. + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Gautham R. 
Shenoy +Reviewed-by: Mario Limonciello +Link: https://lore.kernel.org/r/20241204144842.164178-6-Dhananjay.Ugwekar@amd.com +Signed-off-by: Mario Limonciello +Stable-dep-of: 3ace20038e19 ("cpufreq/amd-pstate: Fix cpufreq_policy ref counting") +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/amd-pstate.c | 17 ++++------------- + 1 file changed, 4 insertions(+), 13 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index e798420bcb5f9..03a5fd713ad59 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1630,11 +1630,14 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + return 0; + } + +-static void amd_pstate_epp_offline(struct cpufreq_policy *policy) ++static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; + int min_perf; + ++ if (cpudata->suspended) ++ return 0; ++ + min_perf = READ_ONCE(cpudata->lowest_perf); + + mutex_lock(&amd_pstate_limits_lock); +@@ -1643,18 +1646,6 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); + + mutex_unlock(&amd_pstate_limits_lock); +-} +- +-static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +-{ +- struct amd_cpudata *cpudata = policy->driver_data; +- +- pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); +- +- if (cpudata->suspended) +- return 0; +- +- amd_pstate_epp_offline(policy); + + return 0; + } +-- +2.39.5 + diff --git a/queue-6.13/cpufreq-amd-pstate-refactor-amd_pstate_epp_reenable-.patch b/queue-6.13/cpufreq-amd-pstate-refactor-amd_pstate_epp_reenable-.patch new file mode 100644 index 0000000000..6562616ca3 --- /dev/null +++ b/queue-6.13/cpufreq-amd-pstate-refactor-amd_pstate_epp_reenable-.patch @@ -0,0 +1,97 @@ +From b5e1c512cd93238b6d9d265a7183b50905753681 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Dec 2024 14:48:40 +0000 +Subject: cpufreq/amd-pstate: Refactor amd_pstate_epp_reenable() and + amd_pstate_epp_offline() + +From: Dhananjay Ugwekar + +[ Upstream commit b1089e0c8817fda93d474eaa82ad86386887aefe ] + +Replace similar code chunks with amd_pstate_update_perf() and +amd_pstate_set_epp() function calls. + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +Link: https://lore.kernel.org/r/20241204144842.164178-4-Dhananjay.Ugwekar@amd.com +[ML: Fix LKP reported error about unused variable] +Signed-off-by: Mario Limonciello +Stable-dep-of: 3ace20038e19 ("cpufreq/amd-pstate: Fix cpufreq_policy ref counting") +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/amd-pstate.c | 38 +++++++----------------------------- + 1 file changed, 7 insertions(+), 31 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index f6d04eb40af94..72c613cba7086 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1605,25 +1605,17 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + + static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + { +- struct cppc_perf_ctrls perf_ctrls; +- u64 value, max_perf; ++ u64 max_perf; + int ret; + + ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + +- value = READ_ONCE(cpudata->cppc_req_cached); + max_perf = READ_ONCE(cpudata->highest_perf); + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- } else { +- perf_ctrls.max_perf = max_perf; +- cppc_set_perf(cpudata->cpu, &perf_ctrls); +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); +- cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); +- } ++ amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); ++ amd_pstate_set_epp(cpudata, cpudata->epp_cached); + } + + static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) +@@ -1643,31 +1635,15 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- struct cppc_perf_ctrls perf_ctrls; + int min_perf; +- u64 value; + + min_perf = READ_ONCE(cpudata->lowest_perf); +- value = READ_ONCE(cpudata->cppc_req_cached); + + mutex_lock(&amd_pstate_limits_lock); +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; +- +- /* Set max perf same as min perf */ +- value &= ~AMD_CPPC_MAX_PERF(~0L); +- value |= AMD_CPPC_MAX_PERF(min_perf); +- value &= ~AMD_CPPC_MIN_PERF(~0L); +- value |= AMD_CPPC_MIN_PERF(min_perf); +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- } else { +- perf_ctrls.desired_perf = 0; +- perf_ctrls.min_perf = min_perf; +- perf_ctrls.max_perf = min_perf; +- cppc_set_perf(cpudata->cpu, &perf_ctrls); +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); +- cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); +- } ++ ++ amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); ++ amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); ++ + mutex_unlock(&amd_pstate_limits_lock); + } + +-- +2.39.5 + diff --git a/queue-6.13/cpufreq-amd-pstate-remove-the-cppc_state-check-in-of.patch b/queue-6.13/cpufreq-amd-pstate-remove-the-cppc_state-check-in-of.patch new file mode 100644 index 0000000000..c061a1fd2f --- /dev/null +++ b/queue-6.13/cpufreq-amd-pstate-remove-the-cppc_state-check-in-of.patch @@ -0,0 +1,56 @@ +From eade9503c75c97b700b9390297f5d49552e426fb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Dec 2024 14:48:41 +0000 +Subject: cpufreq/amd-pstate: Remove the cppc_state check in offline/online + functions + +From: Dhananjay Ugwekar + +[ Upstream commit b78f8c87ec3e7499bb049986838636d3afbc7ece ] + +Only amd_pstate_epp driver (i.e. 
cppc_state = ACTIVE) enters the +amd_pstate_epp_offline() and amd_pstate_epp_cpu_online() functions, +so remove the unnecessary if condition checking if cppc_state is +equal to AMD_PSTATE_ACTIVE. + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +Link: https://lore.kernel.org/r/20241204144842.164178-5-Dhananjay.Ugwekar@amd.com +Signed-off-by: Mario Limonciello +Stable-dep-of: 3ace20038e19 ("cpufreq/amd-pstate: Fix cpufreq_policy ref counting") +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/amd-pstate.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 72c613cba7086..e798420bcb5f9 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1624,10 +1624,8 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + +- if (cppc_state == AMD_PSTATE_ACTIVE) { +- amd_pstate_epp_reenable(cpudata); +- cpudata->suspended = false; +- } ++ amd_pstate_epp_reenable(cpudata); ++ cpudata->suspended = false; + + return 0; + } +@@ -1656,8 +1654,7 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + if (cpudata->suspended) + return 0; + +- if (cppc_state == AMD_PSTATE_ACTIVE) +- amd_pstate_epp_offline(policy); ++ amd_pstate_epp_offline(policy); + + return 0; + } +-- +2.39.5 + diff --git a/queue-6.13/drm-xe-oa-set-stream-pollin-in-xe_oa_buffer_check_un.patch b/queue-6.13/drm-xe-oa-set-stream-pollin-in-xe_oa_buffer_check_un.patch new file mode 100644 index 0000000000..5849074b75 --- /dev/null +++ b/queue-6.13/drm-xe-oa-set-stream-pollin-in-xe_oa_buffer_check_un.patch @@ -0,0 +1,69 @@ +From a51d0181dc34baddfaee4b2f4a6d61caabcd6e37 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jan 2025 14:20:29 -0800 +Subject: drm/xe/oa: Set stream->pollin in xe_oa_buffer_check_unlocked + +From: Ashutosh Dixit + +[ Upstream commit 990d35edc5d333ca6cd3acfdfc13683dc5bb105f ] + +We rely on stream->pollin to decide whether or not to block during +poll/read calls. However, currently there are blocking read code paths +which don't even set stream->pollin. The best place to consistently set +stream->pollin for all code paths is therefore to set it in +xe_oa_buffer_check_unlocked. 
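+
+Condensed from the hunks below (all identifiers as in the diff), the
+helper now both computes and records the wakeup condition, so blocking
+reads and the poll timer observe the same state:
+
+    available = xe_oa_circ_diff(stream, stream->oa_buffer.tail,
+                                stream->oa_buffer.head);
+    stream->pollin = available >= stream->wait_num_reports * report_size;
+    ...
+    return stream->pollin;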
+ +Fixes: e936f885f1e9 ("drm/xe/oa/uapi: Expose OA stream fd") +Signed-off-by: Ashutosh Dixit +Acked-by: Rodrigo Vivi +Reviewed-by: Jonathan Cavitt +Reviewed-by: Umesh Nerlige Ramappa +Link: https://patchwork.freedesktop.org/patch/msgid/20250115222029.3002103-1-ashutosh.dixit@intel.com +(cherry picked from commit d3fedff828bb7e4a422c42caeafd5d974e24ee43) +Signed-off-by: Rodrigo Vivi +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/xe/xe_oa.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c +index d56b0a0ede0da..913f6ba606370 100644 +--- a/drivers/gpu/drm/xe/xe_oa.c ++++ b/drivers/gpu/drm/xe/xe_oa.c +@@ -239,7 +239,6 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) + u32 tail, hw_tail, partial_report_size, available; + int report_size = stream->oa_buffer.format->size; + unsigned long flags; +- bool pollin; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + +@@ -284,11 +283,11 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) + stream->oa_buffer.tail = tail; + + available = xe_oa_circ_diff(stream, stream->oa_buffer.tail, stream->oa_buffer.head); +- pollin = available >= stream->wait_num_reports * report_size; ++ stream->pollin = available >= stream->wait_num_reports * report_size; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + +- return pollin; ++ return stream->pollin; + } + + static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) +@@ -296,10 +295,8 @@ static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) + struct xe_oa_stream *stream = + container_of(hrtimer, typeof(*stream), poll_check_timer); + +- if (xe_oa_buffer_check_unlocked(stream)) { +- stream->pollin = true; ++ if (xe_oa_buffer_check_unlocked(stream)) + wake_up(&stream->poll_wq); +- } + + hrtimer_forward_now(hrtimer, ns_to_ktime(stream->poll_period_ns)); + +-- +2.39.5 + diff --git a/queue-6.13/drm-xe-oa-uapi-expose-an-unblock-after-n-reports-oa-.patch b/queue-6.13/drm-xe-oa-uapi-expose-an-unblock-after-n-reports-oa-.patch new file mode 100644 index 0000000000..4beaf97403 --- /dev/null +++ b/queue-6.13/drm-xe-oa-uapi-expose-an-unblock-after-n-reports-oa-.patch @@ -0,0 +1,176 @@ +From 6495b4bc52308b5ac453f90ee83594e23f10c0e3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Dec 2024 14:49:03 -0800 +Subject: drm/xe/oa/uapi: Expose an unblock after N reports OA property + +From: Ashutosh Dixit + +[ Upstream commit 5637797add2af632a5d037044ab1b0b35643902e ] + +Expose an "unblock after N reports" OA property, to allow userspace threads +to be woken up less frequently. 
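+
+Condensed from the hunks below (identifiers as in the diff): the new
+property defaults to 1, preserving the old "one report" behavior, and
+is capped by how many reports fit in the OA buffer:
+
+    if (!param.wait_num_reports)
+        param.wait_num_reports = 1;
+    if (param.wait_num_reports > param.oa_buffer_size / f->size)
+        return -EINVAL; /* simplified: the driver logs and unwinds */
+
+The unblock condition then becomes "at least N reports available"
+instead of "at least one".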
+ +Co-developed-by: Umesh Nerlige Ramappa +Signed-off-by: Umesh Nerlige Ramappa +Signed-off-by: Ashutosh Dixit +Reviewed-by: Jonathan Cavitt +Reviewed-by: Umesh Nerlige Ramappa +Link: https://patchwork.freedesktop.org/patch/msgid/20241212224903.1853862-1-ashutosh.dixit@intel.com +Stable-dep-of: 990d35edc5d3 ("drm/xe/oa: Set stream->pollin in xe_oa_buffer_check_unlocked") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/xe/xe_oa.c | 30 ++++++++++++++++++++++++++---- + drivers/gpu/drm/xe/xe_oa_types.h | 3 +++ + drivers/gpu/drm/xe/xe_query.c | 3 ++- + include/uapi/drm/xe_drm.h | 7 +++++++ + 4 files changed, 38 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c +index dd9d2d374b2d4..d56b0a0ede0da 100644 +--- a/drivers/gpu/drm/xe/xe_oa.c ++++ b/drivers/gpu/drm/xe/xe_oa.c +@@ -91,6 +91,7 @@ struct xe_oa_open_param { + int num_syncs; + struct xe_sync_entry *syncs; + size_t oa_buffer_size; ++ int wait_num_reports; + }; + + struct xe_oa_config_bo { +@@ -235,11 +236,10 @@ static void oa_timestamp_clear(struct xe_oa_stream *stream, u32 *report) + static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) + { + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); ++ u32 tail, hw_tail, partial_report_size, available; + int report_size = stream->oa_buffer.format->size; +- u32 tail, hw_tail; + unsigned long flags; + bool pollin; +- u32 partial_report_size; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + +@@ -283,8 +283,8 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) + + stream->oa_buffer.tail = tail; + +- pollin = xe_oa_circ_diff(stream, stream->oa_buffer.tail, +- stream->oa_buffer.head) >= report_size; ++ available = xe_oa_circ_diff(stream, stream->oa_buffer.tail, stream->oa_buffer.head); ++ pollin = available >= stream->wait_num_reports * report_size; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + +@@ -1247,6 +1247,17 @@ static int xe_oa_set_prop_oa_buffer_size(struct xe_oa *oa, u64 value, + return 0; + } + ++static int xe_oa_set_prop_wait_num_reports(struct xe_oa *oa, u64 value, ++ struct xe_oa_open_param *param) ++{ ++ if (!value) { ++ drm_dbg(&oa->xe->drm, "wait_num_reports %llu\n", value); ++ return -EINVAL; ++ } ++ param->wait_num_reports = value; ++ return 0; ++} ++ + static int xe_oa_set_prop_ret_inval(struct xe_oa *oa, u64 value, + struct xe_oa_open_param *param) + { +@@ -1268,6 +1279,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_open[] = { + [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, + [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, + [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_oa_buffer_size, ++ [DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS] = xe_oa_set_prop_wait_num_reports, + }; + + static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { +@@ -1283,6 +1295,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { + [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, + [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, + [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_ret_inval, ++ [DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS] = xe_oa_set_prop_ret_inval, + }; + + static int xe_oa_user_ext_set_property(struct xe_oa *oa, enum xe_oa_user_extn_from from, +@@ -1759,6 +1772,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, + stream->periodic = param->period_exponent > 0; + stream->period_exponent = param->period_exponent; + stream->no_preempt = param->no_preempt; ++ 
stream->wait_num_reports = param->wait_num_reports; + + stream->xef = xe_file_get(param->xef); + stream->num_syncs = param->num_syncs; +@@ -2118,6 +2132,14 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f + if (!param.oa_buffer_size) + param.oa_buffer_size = DEFAULT_XE_OA_BUFFER_SIZE; + ++ if (!param.wait_num_reports) ++ param.wait_num_reports = 1; ++ if (param.wait_num_reports > param.oa_buffer_size / f->size) { ++ drm_dbg(&oa->xe->drm, "wait_num_reports %d\n", param.wait_num_reports); ++ ret = -EINVAL; ++ goto err_exec_q; ++ } ++ + ret = xe_oa_parse_syncs(oa, ¶m); + if (ret) + goto err_exec_q; +diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h +index df77939156288..2dcd3b9562e97 100644 +--- a/drivers/gpu/drm/xe/xe_oa_types.h ++++ b/drivers/gpu/drm/xe/xe_oa_types.h +@@ -218,6 +218,9 @@ struct xe_oa_stream { + /** @pollin: Whether there is data available to read */ + bool pollin; + ++ /** @wait_num_reports: Number of reports to wait for before signalling pollin */ ++ int wait_num_reports; ++ + /** @periodic: Whether periodic sampling is currently enabled */ + bool periodic; + +diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c +index 1cda6cbd9b795..1bdffe6315d54 100644 +--- a/drivers/gpu/drm/xe/xe_query.c ++++ b/drivers/gpu/drm/xe/xe_query.c +@@ -671,7 +671,8 @@ static int query_oa_units(struct xe_device *xe, + du->oa_unit_type = u->type; + du->oa_timestamp_freq = xe_oa_timestamp_frequency(gt); + du->capabilities = DRM_XE_OA_CAPS_BASE | DRM_XE_OA_CAPS_SYNCS | +- DRM_XE_OA_CAPS_OA_BUFFER_SIZE; ++ DRM_XE_OA_CAPS_OA_BUFFER_SIZE | ++ DRM_XE_OA_CAPS_WAIT_NUM_REPORTS; + + j = 0; + for_each_hw_engine(hwe, gt, hwe_id) { +diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h +index 0383b52cbd869..f62689ca861a4 100644 +--- a/include/uapi/drm/xe_drm.h ++++ b/include/uapi/drm/xe_drm.h +@@ -1487,6 +1487,7 @@ struct drm_xe_oa_unit { + #define DRM_XE_OA_CAPS_BASE (1 << 0) + #define DRM_XE_OA_CAPS_SYNCS (1 << 1) + #define DRM_XE_OA_CAPS_OA_BUFFER_SIZE (1 << 2) ++#define DRM_XE_OA_CAPS_WAIT_NUM_REPORTS (1 << 3) + + /** @oa_timestamp_freq: OA timestamp freq */ + __u64 oa_timestamp_freq; +@@ -1660,6 +1661,12 @@ enum drm_xe_oa_property_id { + * buffer is allocated by default. + */ + DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE, ++ ++ /** ++ * @DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS: Number of reports to wait ++ * for before unblocking poll or read ++ */ ++ DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS, + }; + + /** +-- +2.39.5 + diff --git a/queue-6.13/drm-xe-oa-uapi-make-oa-buffer-size-configurable.patch b/queue-6.13/drm-xe-oa-uapi-make-oa-buffer-size-configurable.patch new file mode 100644 index 0000000000..9d5cbfb8fa --- /dev/null +++ b/queue-6.13/drm-xe-oa-uapi-make-oa-buffer-size-configurable.patch @@ -0,0 +1,280 @@ +From 2611eee2bfc0e074c50d889eb468a7f2f5485219 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Dec 2024 09:49:13 +0530 +Subject: drm/xe/oa/uapi: Make OA buffer size configurable + +From: Sai Teja Pottumuttu + +[ Upstream commit 720f63a838731d25ab34c306db59c12834ce09b4 ] + +Add a new property called DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE to +allow OA buffer size to be configurable from userspace. + +With this OA buffer size can be configured to any power of 2 +size between 128KB and 128MB and it would default to 16MB in case +the size is not supplied. 
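+
+Condensed from the hunks below (identifiers as in the diff), the
+property is validated and defaulted as:
+
+    if (!is_power_of_2(value) || value < SZ_128K || value > SZ_128M)
+        return -EINVAL; /* simplified: the driver also logs the value */
+    param->oa_buffer_size = value;
+
+    /* later, at stream open, when the property was not supplied: */
+    if (!param.oa_buffer_size)
+        param.oa_buffer_size = DEFAULT_XE_OA_BUFFER_SIZE; /* SZ_16M */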
+ +v2: + - Rebase +v3: + - Add oa buffer size to capabilities [Ashutosh] + - Address several nitpicks [Ashutosh] + - Fix commit message/subject [Ashutosh] + +BSpec: 61100, 61228 +Signed-off-by: Sai Teja Pottumuttu +Reviewed-by: Ashutosh Dixit +Signed-off-by: Ashutosh Dixit +Link: https://patchwork.freedesktop.org/patch/msgid/20241205041913.883767-2-sai.teja.pottumuttu@intel.com +Stable-dep-of: 990d35edc5d3 ("drm/xe/oa: Set stream->pollin in xe_oa_buffer_check_unlocked") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/xe/regs/xe_oa_regs.h | 9 +---- + drivers/gpu/drm/xe/xe_oa.c | 55 ++++++++++++++++++++++------ + drivers/gpu/drm/xe/xe_oa_types.h | 2 +- + drivers/gpu/drm/xe/xe_query.c | 3 +- + include/uapi/drm/xe_drm.h | 9 +++++ + 5 files changed, 56 insertions(+), 22 deletions(-) + +diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h +index 6d31573ed1765..a79ad2da070c2 100644 +--- a/drivers/gpu/drm/xe/regs/xe_oa_regs.h ++++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h +@@ -41,14 +41,6 @@ + + #define OAG_OABUFFER XE_REG(0xdb08) + #define OABUFFER_SIZE_MASK REG_GENMASK(5, 3) +-#define OABUFFER_SIZE_128K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 0) +-#define OABUFFER_SIZE_256K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 1) +-#define OABUFFER_SIZE_512K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 2) +-#define OABUFFER_SIZE_1M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 3) +-#define OABUFFER_SIZE_2M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 4) +-#define OABUFFER_SIZE_4M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 5) +-#define OABUFFER_SIZE_8M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 6) +-#define OABUFFER_SIZE_16M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 7) + #define OAG_OABUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */ + + #define OAG_OACONTROL XE_REG(0xdaf4) +@@ -67,6 +59,7 @@ + #define OAG_OA_DEBUG XE_REG(0xdaf8, XE_REG_OPTION_MASKED) + #define OAG_OA_DEBUG_DISABLE_MMIO_TRG REG_BIT(14) + #define OAG_OA_DEBUG_START_TRIGGER_SCOPE_CONTROL REG_BIT(13) ++#define OAG_OA_DEBUG_BUF_SIZE_SELECT REG_BIT(12) + #define OAG_OA_DEBUG_DISABLE_START_TRG_2_COUNT_QUAL REG_BIT(8) + #define OAG_OA_DEBUG_DISABLE_START_TRG_1_COUNT_QUAL REG_BIT(7) + #define OAG_OA_DEBUG_INCLUDE_CLK_RATIO REG_BIT(6) +diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c +index d8af82dcdce4b..dd9d2d374b2d4 100644 +--- a/drivers/gpu/drm/xe/xe_oa.c ++++ b/drivers/gpu/drm/xe/xe_oa.c +@@ -90,6 +90,7 @@ struct xe_oa_open_param { + struct drm_xe_sync __user *syncs_user; + int num_syncs; + struct xe_sync_entry *syncs; ++ size_t oa_buffer_size; + }; + + struct xe_oa_config_bo { +@@ -397,11 +398,19 @@ static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf, + + static void xe_oa_init_oa_buffer(struct xe_oa_stream *stream) + { +- struct xe_mmio *mmio = &stream->gt->mmio; + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); +- u32 oa_buf = gtt_offset | OABUFFER_SIZE_16M | OAG_OABUFFER_MEMORY_SELECT; ++ int size_exponent = __ffs(stream->oa_buffer.bo->size); ++ u32 oa_buf = gtt_offset | OAG_OABUFFER_MEMORY_SELECT; ++ struct xe_mmio *mmio = &stream->gt->mmio; + unsigned long flags; + ++ /* ++ * If oa buffer size is more than 16MB (exponent greater than 24), the ++ * oa buffer size field is multiplied by 8 in xe_oa_enable_metric_set. ++ */ ++ oa_buf |= REG_FIELD_PREP(OABUFFER_SIZE_MASK, ++ size_exponent > 24 ? 
size_exponent - 20 : size_exponent - 17); ++ + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + xe_mmio_write32(mmio, __oa_regs(stream)->oa_status, 0); +@@ -863,15 +872,12 @@ static void xe_oa_stream_destroy(struct xe_oa_stream *stream) + xe_file_put(stream->xef); + } + +-static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream) ++static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream, size_t size) + { + struct xe_bo *bo; + +- BUILD_BUG_ON_NOT_POWER_OF_2(XE_OA_BUFFER_SIZE); +- BUILD_BUG_ON(XE_OA_BUFFER_SIZE < SZ_128K || XE_OA_BUFFER_SIZE > SZ_16M); +- + bo = xe_bo_create_pin_map(stream->oa->xe, stream->gt->tile, NULL, +- XE_OA_BUFFER_SIZE, ttm_bo_type_kernel, ++ size, ttm_bo_type_kernel, + XE_BO_FLAG_SYSTEM | XE_BO_FLAG_GGTT); + if (IS_ERR(bo)) + return PTR_ERR(bo); +@@ -1049,6 +1055,13 @@ static u32 oag_report_ctx_switches(const struct xe_oa_stream *stream) + 0 : OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS); + } + ++static u32 oag_buf_size_select(const struct xe_oa_stream *stream) ++{ ++ return _MASKED_FIELD(OAG_OA_DEBUG_BUF_SIZE_SELECT, ++ stream->oa_buffer.bo->size > SZ_16M ? ++ OAG_OA_DEBUG_BUF_SIZE_SELECT : 0); ++} ++ + static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) + { + struct xe_mmio *mmio = &stream->gt->mmio; +@@ -1081,6 +1094,7 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) + xe_mmio_write32(mmio, __oa_regs(stream)->oa_debug, + _MASKED_BIT_ENABLE(oa_debug) | + oag_report_ctx_switches(stream) | ++ oag_buf_size_select(stream) | + oag_configure_mmio_trigger(stream, true)); + + xe_mmio_write32(mmio, __oa_regs(stream)->oa_ctx_ctrl, stream->periodic ? +@@ -1222,6 +1236,17 @@ static int xe_oa_set_prop_syncs_user(struct xe_oa *oa, u64 value, + return 0; + } + ++static int xe_oa_set_prop_oa_buffer_size(struct xe_oa *oa, u64 value, ++ struct xe_oa_open_param *param) ++{ ++ if (!is_power_of_2(value) || value < SZ_128K || value > SZ_128M) { ++ drm_dbg(&oa->xe->drm, "OA buffer size invalid %llu\n", value); ++ return -EINVAL; ++ } ++ param->oa_buffer_size = value; ++ return 0; ++} ++ + static int xe_oa_set_prop_ret_inval(struct xe_oa *oa, u64 value, + struct xe_oa_open_param *param) + { +@@ -1242,6 +1267,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_open[] = { + [DRM_XE_OA_PROPERTY_NO_PREEMPT] = xe_oa_set_no_preempt, + [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, + [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, ++ [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_oa_buffer_size, + }; + + static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { +@@ -1256,6 +1282,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { + [DRM_XE_OA_PROPERTY_NO_PREEMPT] = xe_oa_set_prop_ret_inval, + [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, + [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, ++ [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_ret_inval, + }; + + static int xe_oa_user_ext_set_property(struct xe_oa *oa, enum xe_oa_user_extn_from from, +@@ -1515,7 +1542,7 @@ static long xe_oa_status_locked(struct xe_oa_stream *stream, unsigned long arg) + + static long xe_oa_info_locked(struct xe_oa_stream *stream, unsigned long arg) + { +- struct drm_xe_oa_stream_info info = { .oa_buf_size = XE_OA_BUFFER_SIZE, }; ++ struct drm_xe_oa_stream_info info = { .oa_buf_size = stream->oa_buffer.bo->size, }; + void __user *uaddr = (void __user *)arg; + + if (copy_to_user(uaddr, &info, sizeof(info))) +@@ -1601,7 +1628,7 @@ static int xe_oa_mmap(struct file *file, 
struct vm_area_struct *vma) + } + + /* Can mmap the entire OA buffer or nothing (no partial OA buffer mmaps) */ +- if (vma->vm_end - vma->vm_start != XE_OA_BUFFER_SIZE) { ++ if (vma->vm_end - vma->vm_start != stream->oa_buffer.bo->size) { + drm_dbg(&stream->oa->xe->drm, "Wrong mmap size, must be OA buffer size\n"); + return -EINVAL; + } +@@ -1745,9 +1772,10 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, + if (GRAPHICS_VER(stream->oa->xe) >= 20 && + stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG && stream->sample) + stream->oa_buffer.circ_size = +- XE_OA_BUFFER_SIZE - XE_OA_BUFFER_SIZE % stream->oa_buffer.format->size; ++ param->oa_buffer_size - ++ param->oa_buffer_size % stream->oa_buffer.format->size; + else +- stream->oa_buffer.circ_size = XE_OA_BUFFER_SIZE; ++ stream->oa_buffer.circ_size = param->oa_buffer_size; + + if (stream->exec_q && engine_supports_mi_query(stream->hwe)) { + /* If we don't find the context offset, just return error */ +@@ -1790,7 +1818,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, + goto err_fw_put; + } + +- ret = xe_oa_alloc_oa_buffer(stream); ++ ret = xe_oa_alloc_oa_buffer(stream, param->oa_buffer_size); + if (ret) + goto err_fw_put; + +@@ -2087,6 +2115,9 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f + drm_dbg(&oa->xe->drm, "Using periodic sampling freq %lld Hz\n", oa_freq_hz); + } + ++ if (!param.oa_buffer_size) ++ param.oa_buffer_size = DEFAULT_XE_OA_BUFFER_SIZE; ++ + ret = xe_oa_parse_syncs(oa, ¶m); + if (ret) + goto err_exec_q; +diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h +index fea9d981e414f..df77939156288 100644 +--- a/drivers/gpu/drm/xe/xe_oa_types.h ++++ b/drivers/gpu/drm/xe/xe_oa_types.h +@@ -15,7 +15,7 @@ + #include "regs/xe_reg_defs.h" + #include "xe_hw_engine_types.h" + +-#define XE_OA_BUFFER_SIZE SZ_16M ++#define DEFAULT_XE_OA_BUFFER_SIZE SZ_16M + + enum xe_oa_report_header { + HDR_32_BIT = 0, +diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c +index 170ae72d1a7bb..1cda6cbd9b795 100644 +--- a/drivers/gpu/drm/xe/xe_query.c ++++ b/drivers/gpu/drm/xe/xe_query.c +@@ -670,7 +670,8 @@ static int query_oa_units(struct xe_device *xe, + du->oa_unit_id = u->oa_unit_id; + du->oa_unit_type = u->type; + du->oa_timestamp_freq = xe_oa_timestamp_frequency(gt); +- du->capabilities = DRM_XE_OA_CAPS_BASE | DRM_XE_OA_CAPS_SYNCS; ++ du->capabilities = DRM_XE_OA_CAPS_BASE | DRM_XE_OA_CAPS_SYNCS | ++ DRM_XE_OA_CAPS_OA_BUFFER_SIZE; + + j = 0; + for_each_hw_engine(hwe, gt, hwe_id) { +diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h +index 4a8a4a63e99ca..0383b52cbd869 100644 +--- a/include/uapi/drm/xe_drm.h ++++ b/include/uapi/drm/xe_drm.h +@@ -1486,6 +1486,7 @@ struct drm_xe_oa_unit { + __u64 capabilities; + #define DRM_XE_OA_CAPS_BASE (1 << 0) + #define DRM_XE_OA_CAPS_SYNCS (1 << 1) ++#define DRM_XE_OA_CAPS_OA_BUFFER_SIZE (1 << 2) + + /** @oa_timestamp_freq: OA timestamp freq */ + __u64 oa_timestamp_freq; +@@ -1651,6 +1652,14 @@ enum drm_xe_oa_property_id { + * to the VM bind case. + */ + DRM_XE_OA_PROPERTY_SYNCS, ++ ++ /** ++ * @DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE: Size of OA buffer to be ++ * allocated by the driver in bytes. Supported sizes are powers of ++ * 2 from 128 KiB to 128 MiB. When not specified, a 16 MiB OA ++ * buffer is allocated by default. 
++ */ ++ DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE, + }; + + /** +-- +2.39.5 + diff --git a/queue-6.13/eth-iavf-extend-the-netdev_lock-usage.patch b/queue-6.13/eth-iavf-extend-the-netdev_lock-usage.patch new file mode 100644 index 0000000000..8c2e8cb80c --- /dev/null +++ b/queue-6.13/eth-iavf-extend-the-netdev_lock-usage.patch @@ -0,0 +1,297 @@ +From e97a997da4966a6f66911fd3a6903f4c4fc8d978 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Jan 2025 23:13:39 -0800 +Subject: eth: iavf: extend the netdev_lock usage + +From: Jakub Kicinski + +[ Upstream commit afc664987ab318c227ebc0f639f5afc921aaf674 ] + +iavf uses the netdev->lock already to protect shapers. +In an upcoming series we'll try to protect NAPI instances +with netdev->lock. + +We need to modify the protection a bit. All NAPI related +calls in the driver need to be consistently under the lock. +This will allow us to easily switch to a "we already hold +the lock" NAPI API later. + +register_netdevice(), OTOH, must not be called under +the netdev_lock() as we do not intend to have an +"already locked" version of this call. + +Link: https://patch.msgid.link/20250111071339.3709071-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/iavf/iavf_main.c | 53 +++++++++++++++++---- + 1 file changed, 45 insertions(+), 8 deletions(-) + +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index 2b8700abe56bb..7c427003184d5 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -1983,6 +1983,7 @@ static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter, bool runni + static void iavf_finish_config(struct work_struct *work) + { + struct iavf_adapter *adapter; ++ bool netdev_released = false; + int pairs, err; + + adapter = container_of(work, struct iavf_adapter, finish_config); +@@ -2003,7 +2004,16 @@ static void iavf_finish_config(struct work_struct *work) + + switch (adapter->state) { + case __IAVF_DOWN: ++ /* Set the real number of queues when reset occurs while ++ * state == __IAVF_DOWN ++ */ ++ pairs = adapter->num_active_queues; ++ netif_set_real_num_rx_queues(adapter->netdev, pairs); ++ netif_set_real_num_tx_queues(adapter->netdev, pairs); ++ + if (adapter->netdev->reg_state != NETREG_REGISTERED) { ++ mutex_unlock(&adapter->netdev->lock); ++ netdev_released = true; + err = register_netdevice(adapter->netdev); + if (err) { + dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n", +@@ -2018,11 +2028,7 @@ static void iavf_finish_config(struct work_struct *work) + goto out; + } + } +- +- /* Set the real number of queues when reset occurs while +- * state == __IAVF_DOWN +- */ +- fallthrough; ++ break; + case __IAVF_RUNNING: + pairs = adapter->num_active_queues; + netif_set_real_num_rx_queues(adapter->netdev, pairs); +@@ -2035,7 +2041,8 @@ static void iavf_finish_config(struct work_struct *work) + + out: + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&adapter->netdev->lock); ++ if (!netdev_released) ++ mutex_unlock(&adapter->netdev->lock); + rtnl_unlock(); + } + +@@ -2728,12 +2735,16 @@ static void iavf_watchdog_task(struct work_struct *work) + struct iavf_adapter *adapter = container_of(work, + struct iavf_adapter, + watchdog_task.work); ++ struct net_device *netdev = adapter->netdev; + struct iavf_hw *hw = &adapter->hw; + u32 reg_val; + ++ 
mutex_lock(&netdev->lock); + if (!mutex_trylock(&adapter->crit_lock)) { +- if (adapter->state == __IAVF_REMOVE) ++ if (adapter->state == __IAVF_REMOVE) { ++ mutex_unlock(&netdev->lock); + return; ++ } + + goto restart_watchdog; + } +@@ -2745,30 +2756,35 @@ static void iavf_watchdog_task(struct work_struct *work) + case __IAVF_STARTUP: + iavf_startup(adapter); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(30)); + return; + case __IAVF_INIT_VERSION_CHECK: + iavf_init_version_check(adapter); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(30)); + return; + case __IAVF_INIT_GET_RESOURCES: + iavf_init_get_resources(adapter); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; + case __IAVF_INIT_EXTENDED_CAPS: + iavf_init_process_extended_caps(adapter); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; + case __IAVF_INIT_CONFIG_ADAPTER: + iavf_init_config_adapter(adapter); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; +@@ -2780,6 +2796,7 @@ static void iavf_watchdog_task(struct work_struct *work) + * as it can loop forever + */ + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + return; + } + if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { +@@ -2788,6 +2805,7 @@ static void iavf_watchdog_task(struct work_struct *work) + adapter->flags |= IAVF_FLAG_PF_COMMS_FAILED; + iavf_shutdown_adminq(hw); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, (5 * HZ)); + return; +@@ -2795,6 +2813,7 @@ static void iavf_watchdog_task(struct work_struct *work) + /* Try again from failed step*/ + iavf_change_state(adapter, adapter->last_state); + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ); + return; + case __IAVF_COMM_FAILED: +@@ -2807,6 +2826,7 @@ static void iavf_watchdog_task(struct work_struct *work) + iavf_change_state(adapter, __IAVF_INIT_FAILED); + adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED; + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + return; + } + reg_val = rd32(hw, IAVF_VFGEN_RSTAT) & +@@ -2826,12 +2846,14 @@ static void iavf_watchdog_task(struct work_struct *work) + adapter->aq_required = 0; + adapter->current_op = VIRTCHNL_OP_UNKNOWN; + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, + msecs_to_jiffies(10)); + return; + case __IAVF_RESETTING: + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + HZ * 2); + return; +@@ -2862,6 +2884,7 @@ static void iavf_watchdog_task(struct work_struct *work) + case __IAVF_REMOVE: + default: + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + return; + } + +@@ -2873,12 +2896,14 @@ static void iavf_watchdog_task(struct work_struct *work) + dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING); + 
mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, HZ * 2); + return; + } + + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + restart_watchdog: + if (adapter->state >= __IAVF_DOWN) + queue_work(adapter->wq, &adapter->adminq_task); +@@ -4355,14 +4380,17 @@ static int iavf_open(struct net_device *netdev) + return -EIO; + } + ++ mutex_lock(&netdev->lock); + while (!mutex_trylock(&adapter->crit_lock)) { + /* If we are in __IAVF_INIT_CONFIG_ADAPTER state the crit_lock + * is already taken and iavf_open is called from an upper + * device's notifier reacting on NETDEV_REGISTER event. + * We have to leave here to avoid dead lock. + */ +- if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) ++ if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) { ++ mutex_unlock(&netdev->lock); + return -EBUSY; ++ } + + usleep_range(500, 1000); + } +@@ -4411,6 +4439,7 @@ static int iavf_open(struct net_device *netdev) + iavf_irq_enable(adapter, true); + + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + + return 0; + +@@ -4423,6 +4452,7 @@ static int iavf_open(struct net_device *netdev) + iavf_free_all_tx_resources(adapter); + err_unlock: + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + + return err; + } +@@ -4444,10 +4474,12 @@ static int iavf_close(struct net_device *netdev) + u64 aq_to_restore; + int status; + ++ mutex_lock(&netdev->lock); + mutex_lock(&adapter->crit_lock); + + if (adapter->state <= __IAVF_DOWN_PENDING) { + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + return 0; + } + +@@ -4481,6 +4513,7 @@ static int iavf_close(struct net_device *netdev) + iavf_free_traffic_irqs(adapter); + + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + + /* We explicitly don't free resources here because the hardware is + * still active and can DMA into memory. 
Resources are cleared in +@@ -5357,6 +5390,7 @@ static int iavf_suspend(struct device *dev_d) + + netif_device_detach(netdev); + ++ mutex_lock(&netdev->lock); + mutex_lock(&adapter->crit_lock); + + if (netif_running(netdev)) { +@@ -5368,6 +5402,7 @@ static int iavf_suspend(struct device *dev_d) + iavf_reset_interrupt_capability(adapter); + + mutex_unlock(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + + return 0; + } +@@ -5466,6 +5501,7 @@ static void iavf_remove(struct pci_dev *pdev) + if (netdev->reg_state == NETREG_REGISTERED) + unregister_netdev(netdev); + ++ mutex_lock(&netdev->lock); + mutex_lock(&adapter->crit_lock); + dev_info(&adapter->pdev->dev, "Removing device\n"); + iavf_change_state(adapter, __IAVF_REMOVE); +@@ -5502,6 +5538,7 @@ static void iavf_remove(struct pci_dev *pdev) + mutex_destroy(&hw->aq.asq_mutex); + mutex_unlock(&adapter->crit_lock); + mutex_destroy(&adapter->crit_lock); ++ mutex_unlock(&netdev->lock); + + iounmap(hw->hw_addr); + pci_release_regions(pdev); +-- +2.39.5 + diff --git a/queue-6.13/flow_dissector-use-rcu-protection-to-fetch-dev_net.patch b/queue-6.13/flow_dissector-use-rcu-protection-to-fetch-dev_net.patch new file mode 100644 index 0000000000..db76040436 --- /dev/null +++ b/queue-6.13/flow_dissector-use-rcu-protection-to-fetch-dev_net.patch @@ -0,0 +1,81 @@ +From a5f8d1fe8d4ea527a69d80b29c4561e8f5cae669 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:17 +0000 +Subject: flow_dissector: use RCU protection to fetch dev_net() + +From: Eric Dumazet + +[ Upstream commit afec62cd0a4191cde6dd3a75382be4d51a38ce9b ] + +__skb_flow_dissect() can be called from arbitrary contexts. + +It must extend its RCU protection section to include +the call to dev_net(), which can become dev_net_rcu(). + +This makes sure the net structure can not disappear under us. 
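+
+A minimal sketch of the pattern this series converts callers to
+(illustrative only; the surrounding logic is elided):
+
+	rcu_read_lock();
+	net = dev_net_rcu(skb->dev);	/* LOCKDEP-checked accessor */
+	/* ... dereference net; it cannot be freed inside the section ... */
+	rcu_read_unlock();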
+ +Fixes: 9b52e3f267a6 ("flow_dissector: handle no-skb use case") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250205155120.1676781-10-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/core/flow_dissector.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c +index 0e638a37aa096..5db41bf2ed93e 100644 +--- a/net/core/flow_dissector.c ++++ b/net/core/flow_dissector.c +@@ -1108,10 +1108,12 @@ bool __skb_flow_dissect(const struct net *net, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + ++ rcu_read_lock(); ++ + if (skb) { + if (!net) { + if (skb->dev) +- net = dev_net(skb->dev); ++ net = dev_net_rcu(skb->dev); + else if (skb->sk) + net = sock_net(skb->sk); + } +@@ -1122,7 +1124,6 @@ bool __skb_flow_dissect(const struct net *net, + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; + +- rcu_read_lock(); + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); +@@ -1150,17 +1151,17 @@ bool __skb_flow_dissect(const struct net *net, + prog = READ_ONCE(run_array->items[0].prog); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); +- if (result == BPF_FLOW_DISSECTOR_CONTINUE) +- goto dissect_continue; +- __skb_flow_bpf_to_target(&flow_keys, flow_dissector, +- target_container); +- rcu_read_unlock(); +- return result == BPF_OK; ++ if (result != BPF_FLOW_DISSECTOR_CONTINUE) { ++ __skb_flow_bpf_to_target(&flow_keys, flow_dissector, ++ target_container); ++ rcu_read_unlock(); ++ return result == BPF_OK; ++ } + } +-dissect_continue: +- rcu_read_unlock(); + } + ++ rcu_read_unlock(); ++ + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); +-- +2.39.5 + diff --git a/queue-6.13/genirq-remove-leading-space-from-irq_chip-irq_print_.patch b/queue-6.13/genirq-remove-leading-space-from-irq_chip-irq_print_.patch new file mode 100644 index 0000000000..2cc44cd934 --- /dev/null +++ b/queue-6.13/genirq-remove-leading-space-from-irq_chip-irq_print_.patch @@ -0,0 +1,81 @@ +From 7fc8186bf729cb4fa54e0c05aa4c775ca6a508e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:22:56 +0100 +Subject: genirq: Remove leading space from irq_chip::irq_print_chip() + callbacks + +From: Geert Uytterhoeven + +[ Upstream commit 29a61a1f40637ae010b828745fb41f60301c3a3d ] + +The space separator was factored out from the multiple chip name prints, +but several irq_chip::irq_print_chip() callbacks still print a leading +space. Remove the superfluous double spaces. 
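+
+For illustration (annotation, not part of the original changelog): with
+the core now emitting the separator, a callback doing
+
+	seq_printf(p, " %8s", dev_name(entry->smp2p->dev));
+
+printed two consecutive spaces in the /proc/interrupts output; dropping
+the leading space from the format string restores single spacing.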
+ +Fixes: 9d9f204bdf7243bf ("genirq/proc: Add missing space separator back") +Signed-off-by: Geert Uytterhoeven +Signed-off-by: Thomas Gleixner +Link: https://lore.kernel.org/all/893f7e9646d8933cd6786d5a1ef3eb076d263768.1738764803.git.geert+renesas@glider.be +Signed-off-by: Sasha Levin +--- + arch/powerpc/sysdev/fsl_msi.c | 2 +- + drivers/bus/moxtet.c | 2 +- + drivers/irqchip/irq-partition-percpu.c | 2 +- + drivers/soc/qcom/smp2p.c | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c +index 1aa0cb097c9c9..7b9a5ea9cad9d 100644 +--- a/arch/powerpc/sysdev/fsl_msi.c ++++ b/arch/powerpc/sysdev/fsl_msi.c +@@ -75,7 +75,7 @@ static void fsl_msi_print_chip(struct irq_data *irqd, struct seq_file *p) + srs = (hwirq >> msi_data->srs_shift) & MSI_SRS_MASK; + cascade_virq = msi_data->cascade_array[srs]->virq; + +- seq_printf(p, " fsl-msi-%d", cascade_virq); ++ seq_printf(p, "fsl-msi-%d", cascade_virq); + } + + +diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c +index 6276551d79680..1e57ebfb76229 100644 +--- a/drivers/bus/moxtet.c ++++ b/drivers/bus/moxtet.c +@@ -657,7 +657,7 @@ static void moxtet_irq_print_chip(struct irq_data *d, struct seq_file *p) + + id = moxtet->modules[pos->idx]; + +- seq_printf(p, " moxtet-%s.%i#%i", mox_module_name(id), pos->idx, ++ seq_printf(p, "moxtet-%s.%i#%i", mox_module_name(id), pos->idx, + pos->bit); + } + +diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c +index 8e76d2913e6be..4441ffe149ea0 100644 +--- a/drivers/irqchip/irq-partition-percpu.c ++++ b/drivers/irqchip/irq-partition-percpu.c +@@ -98,7 +98,7 @@ static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p) + struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); + struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); + +- seq_printf(p, " %5s-%lu", chip->name, data->hwirq); ++ seq_printf(p, "%5s-%lu", chip->name, data->hwirq); + } + + static struct irq_chip partition_irq_chip = { +diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c +index 4783ab1adb8d9..a3e88ced328a9 100644 +--- a/drivers/soc/qcom/smp2p.c ++++ b/drivers/soc/qcom/smp2p.c +@@ -365,7 +365,7 @@ static void smp2p_irq_print_chip(struct irq_data *irqd, struct seq_file *p) + { + struct smp2p_entry *entry = irq_data_get_irq_chip_data(irqd); + +- seq_printf(p, " %8s", dev_name(entry->smp2p->dev)); ++ seq_printf(p, "%8s", dev_name(entry->smp2p->dev)); + } + + static struct irq_chip smp2p_irq_chip = { +-- +2.39.5 + diff --git a/queue-6.13/hid-hid-steam-make-sure-rumble-work-is-canceled-on-r.patch b/queue-6.13/hid-hid-steam-make-sure-rumble-work-is-canceled-on-r.patch new file mode 100644 index 0000000000..4d2787c6ac --- /dev/null +++ b/queue-6.13/hid-hid-steam-make-sure-rumble-work-is-canceled-on-r.patch @@ -0,0 +1,38 @@ +From eca5196b2cfe719e166379176ab6a84cc995a545 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Dec 2024 18:34:24 -0800 +Subject: HID: hid-steam: Make sure rumble work is canceled on removal + +From: Vicki Pfau + +[ Upstream commit cc4f952427aaa44ecfd92542e10a65cce67bd6f4 ] + +When a force feedback command is sent from userspace, work is scheduled to pass +this data to the controller without blocking userspace itself. However, in +theory, this work might not be properly canceled if the controller is removed +at the exact right time. This patch ensures the work is properly canceled when +the device is removed. 
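+
+A sketch of why the ordering matters (annotation; simplified from the
+hunk below):
+
+	cancel_work_sync(&steam->rumble_work);	/* waits for a running callback */
+	hid_destroy_device(steam->client_hdev);	/* safe: rumble work cannot run again */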
+ +Signed-off-by: Vicki Pfau +Signed-off-by: Jiri Kosina +Stable-dep-of: 79504249d7e2 ("HID: hid-steam: Move hidraw input (un)registering to work") +Signed-off-by: Sasha Levin +--- + drivers/hid/hid-steam.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c +index 12a6887cd12c9..48139ef80dc11 100644 +--- a/drivers/hid/hid-steam.c ++++ b/drivers/hid/hid-steam.c +@@ -1306,6 +1306,7 @@ static void steam_remove(struct hid_device *hdev) + + cancel_delayed_work_sync(&steam->mode_switch); + cancel_work_sync(&steam->work_connect); ++ cancel_work_sync(&steam->rumble_work); + hid_destroy_device(steam->client_hdev); + steam->client_hdev = NULL; + steam->client_opened = 0; +-- +2.39.5 + diff --git a/queue-6.13/hid-hid-steam-move-hidraw-input-un-registering-to-wo.patch b/queue-6.13/hid-hid-steam-move-hidraw-input-un-registering-to-wo.patch new file mode 100644 index 0000000000..20a815afd0 --- /dev/null +++ b/queue-6.13/hid-hid-steam-move-hidraw-input-un-registering-to-wo.patch @@ -0,0 +1,117 @@ +From c73eea670e4912d5cda5a67aa35fd025d63bd094 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 4 Feb 2025 19:55:27 -0800 +Subject: HID: hid-steam: Move hidraw input (un)registering to work + +From: Vicki Pfau + +[ Upstream commit 79504249d7e27cad4a3eeb9afc6386e418728ce0 ] + +Due to an interplay between locking in the input and hid transport subsystems, +attempting to register or deregister the relevant input devices during the +hidraw open/close events can lead to a lock ordering issue. Though this +shouldn't cause a deadlock, this commit moves the input device manipulation to +deferred work to sidestep the issue. + +Fixes: 385a4886778f6 ("HID: steam: remove input device when a hid client is running.") +Signed-off-by: Vicki Pfau +Signed-off-by: Jiri Kosina +Signed-off-by: Sasha Levin +--- + drivers/hid/hid-steam.c | 38 +++++++++++++++++++++++++++++++------- + 1 file changed, 31 insertions(+), 7 deletions(-) + +diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c +index 48139ef80dc11..5f8518f6f5ac7 100644 +--- a/drivers/hid/hid-steam.c ++++ b/drivers/hid/hid-steam.c +@@ -313,6 +313,7 @@ struct steam_device { + u16 rumble_left; + u16 rumble_right; + unsigned int sensor_timestamp_us; ++ struct work_struct unregister_work; + }; + + static int steam_recv_report(struct steam_device *steam, +@@ -1072,6 +1073,31 @@ static void steam_mode_switch_cb(struct work_struct *work) + } + } + ++static void steam_work_unregister_cb(struct work_struct *work) ++{ ++ struct steam_device *steam = container_of(work, struct steam_device, ++ unregister_work); ++ unsigned long flags; ++ bool connected; ++ bool opened; ++ ++ spin_lock_irqsave(&steam->lock, flags); ++ opened = steam->client_opened; ++ connected = steam->connected; ++ spin_unlock_irqrestore(&steam->lock, flags); ++ ++ if (connected) { ++ if (opened) { ++ steam_sensors_unregister(steam); ++ steam_input_unregister(steam); ++ } else { ++ steam_set_lizard_mode(steam, lizard_mode); ++ steam_input_register(steam); ++ steam_sensors_register(steam); ++ } ++ } ++} ++ + static bool steam_is_valve_interface(struct hid_device *hdev) + { + struct hid_report_enum *rep_enum; +@@ -1117,8 +1143,7 @@ static int steam_client_ll_open(struct hid_device *hdev) + steam->client_opened++; + spin_unlock_irqrestore(&steam->lock, flags); + +- steam_sensors_unregister(steam); +- steam_input_unregister(steam); ++ schedule_work(&steam->unregister_work); + + return 0; + } +@@ -1135,11 +1160,7 @@ static void steam_client_ll_close(struct 
hid_device *hdev) + connected = steam->connected && !steam->client_opened; + spin_unlock_irqrestore(&steam->lock, flags); + +- if (connected) { +- steam_set_lizard_mode(steam, lizard_mode); +- steam_input_register(steam); +- steam_sensors_register(steam); +- } ++ schedule_work(&steam->unregister_work); + } + + static int steam_client_ll_raw_request(struct hid_device *hdev, +@@ -1231,6 +1252,7 @@ static int steam_probe(struct hid_device *hdev, + INIT_LIST_HEAD(&steam->list); + INIT_WORK(&steam->rumble_work, steam_haptic_rumble_cb); + steam->sensor_timestamp_us = 0; ++ INIT_WORK(&steam->unregister_work, steam_work_unregister_cb); + + /* + * With the real steam controller interface, do not connect hidraw. +@@ -1291,6 +1313,7 @@ static int steam_probe(struct hid_device *hdev, + cancel_work_sync(&steam->work_connect); + cancel_delayed_work_sync(&steam->mode_switch); + cancel_work_sync(&steam->rumble_work); ++ cancel_work_sync(&steam->unregister_work); + + return ret; + } +@@ -1307,6 +1330,7 @@ static void steam_remove(struct hid_device *hdev) + cancel_delayed_work_sync(&steam->mode_switch); + cancel_work_sync(&steam->work_connect); + cancel_work_sync(&steam->rumble_work); ++ cancel_work_sync(&steam->unregister_work); + hid_destroy_device(steam->client_hdev); + steam->client_hdev = NULL; + steam->client_opened = 0; +-- +2.39.5 + diff --git a/queue-6.13/iavf-fix-a-locking-bug-in-an-error-path.patch b/queue-6.13/iavf-fix-a-locking-bug-in-an-error-path.patch new file mode 100644 index 0000000000..7990adfbbc --- /dev/null +++ b/queue-6.13/iavf-fix-a-locking-bug-in-an-error-path.patch @@ -0,0 +1,38 @@ +From f51cefa621353b4fab9067b989bf8fd4c8258f70 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Feb 2025 09:51:08 -0800 +Subject: iavf: Fix a locking bug in an error path + +From: Bart Van Assche + +[ Upstream commit e589adf5b70c07b1ab974d077046fdbf583b2f36 ] + +If the netdev lock has been obtained, unlock it before returning. +This bug has been detected by the Clang thread-safety analyzer. 
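+
+The fixed control flow, in sketch form (annotation):
+
+	mutex_unlock(&adapter->crit_lock);
+restart_watchdog:
+	netdev_unlock(netdev);	/* now also runs on the goto path that
+				 * never acquired crit_lock */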
+ +Fixes: afc664987ab3 ("eth: iavf: extend the netdev_lock usage") +Signed-off-by: Bart Van Assche +Link: https://patch.msgid.link/20250206175114.1974171-28-bvanassche@acm.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index 4639f55a17be1..37904e2de30bd 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -2903,8 +2903,8 @@ static void iavf_watchdog_task(struct work_struct *work) + } + + mutex_unlock(&adapter->crit_lock); +- netdev_unlock(netdev); + restart_watchdog: ++ netdev_unlock(netdev); + if (adapter->state >= __IAVF_DOWN) + queue_work(adapter->wq, &adapter->adminq_task); + if (adapter->aq_required) +-- +2.39.5 + diff --git a/queue-6.13/include-net-add-static-inline-dst_dev_overhead-to-ds.patch b/queue-6.13/include-net-add-static-inline-dst_dev_overhead-to-ds.patch new file mode 100644 index 0000000000..47765169e2 --- /dev/null +++ b/queue-6.13/include-net-add-static-inline-dst_dev_overhead-to-ds.patch @@ -0,0 +1,49 @@ +From 68d90abd9928ebf5ce0d9a8f63998af6b6bcebda Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 Dec 2024 13:49:42 +0100 +Subject: include: net: add static inline dst_dev_overhead() to dst.h + +From: Justin Iurman + +[ Upstream commit 0600cf40e9b36fe17f9c9f04d4f9cef249eaa5e7 ] + +Add static inline dst_dev_overhead() function to include/net/dst.h. This +helper function is used by ioam6_iptunnel, rpl_iptunnel and +seg6_iptunnel to get the dev's overhead based on a cache entry +(dst_entry). If the cache is empty, the default and generic value +skb->mac_len is returned. Otherwise, LL_RESERVED_SPACE() over dst's dev +is returned. 
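+
+A hedged usage sketch (the caller shape is illustrative; the real call
+sites land in the lwtunnel patches later in this series):
+
+	err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+	if (unlikely(err))
+		goto drop;
+
+With a populated dst cache this reserves the device's real hard-header
+room; with an empty cache it falls back to skb->mac_len.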
+ +Signed-off-by: Justin Iurman +Cc: Alexander Lobakin +Cc: Vadim Fedorenko +Signed-off-by: Paolo Abeni +Stable-dep-of: 92191dd10730 ("net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels") +Signed-off-by: Sasha Levin +--- + include/net/dst.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/net/dst.h b/include/net/dst.h +index 0f303cc602520..08647c99d79c9 100644 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -440,6 +440,15 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) + dst->expires = expires; + } + ++static inline unsigned int dst_dev_overhead(struct dst_entry *dst, ++ struct sk_buff *skb) ++{ ++ if (likely(dst)) ++ return LL_RESERVED_SPACE(dst->dev); ++ ++ return skb->mac_len; ++} ++ + INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, + struct sk_buff *)); + INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, +-- +2.39.5 + diff --git a/queue-6.13/io_uring-uring_cmd-cleanup-struct-io_uring_cmd_data-.patch b/queue-6.13/io_uring-uring_cmd-cleanup-struct-io_uring_cmd_data-.patch new file mode 100644 index 0000000000..816886fba3 --- /dev/null +++ b/queue-6.13/io_uring-uring_cmd-cleanup-struct-io_uring_cmd_data-.patch @@ -0,0 +1,49 @@ +From e5e1ae0460f2f0aa8ac5d15a57d2d94f497a5568 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 22 Jan 2025 19:50:31 -0700 +Subject: io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout + +From: Jens Axboe + +[ Upstream commit eaf72f7b414f5944585e7dee9c915c7f8f7f6344 ] + +A few spots in uring_cmd assume that the SQEs copied are always at the +start of the structure, and hence mix req->async_data and the struct +itself. + +Clean that up and use the proper indices. + +Signed-off-by: Jens Axboe +Stable-dep-of: e663da62ba86 ("io_uring/uring_cmd: switch sqe to async_data on EAGAIN") +Signed-off-by: Sasha Levin +--- + io_uring/uring_cmd.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c +index f43adcc16cf65..caed143fb156d 100644 +--- a/io_uring/uring_cmd.c ++++ b/io_uring/uring_cmd.c +@@ -201,8 +201,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, + return 0; + } + +- memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); +- ioucmd->sqe = req->async_data; ++ memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); ++ ioucmd->sqe = cache->sqes; + return 0; + } + +@@ -269,7 +269,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) + struct io_uring_cmd_data *cache = req->async_data; + + if (ioucmd->sqe != (void *) cache) +- memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); ++ memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { + return -EIOCBQUEUED; +-- +2.39.5 + diff --git a/queue-6.13/io_uring-uring_cmd-don-t-assume-io_uring_cmd_data-la.patch b/queue-6.13/io_uring-uring_cmd-don-t-assume-io_uring_cmd_data-la.patch new file mode 100644 index 0000000000..97e65c19e1 --- /dev/null +++ b/queue-6.13/io_uring-uring_cmd-don-t-assume-io_uring_cmd_data-la.patch @@ -0,0 +1,46 @@ +From 4d6ec1ef4bd63fd08c146120dd3aa9dad56a90c8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 12 Feb 2025 13:45:45 -0700 +Subject: io_uring/uring_cmd: don't assume io_uring_cmd_data layout + +From: Caleb Sander Mateos + +[ Upstream commit 34cae91215c6f65bed2a124fb9283da6ec0b8dd9 ] + +eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data +layout") removed most of the places assuming struct io_uring_cmd_data 
+has sqes as its first field. However, the EAGAIN case in io_uring_cmd() +still compares ioucmd->sqe to the struct io_uring_cmd_data pointer using +a void * cast. Since fa3595523d72 ("io_uring: get rid of alloc cache +init_once handling"), sqes is no longer io_uring_cmd_data's first field. +As a result, the pointers will always compare unequal and memcpy() may +be called with the same source and destination. + +Replace the incorrect void * cast with the address of the sqes field. + +Signed-off-by: Caleb Sander Mateos +Fixes: eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout") +Link: https://lore.kernel.org/r/20250212204546.3751645-2-csander@purestorage.com +Signed-off-by: Jens Axboe +Stable-dep-of: e663da62ba86 ("io_uring/uring_cmd: switch sqe to async_data on EAGAIN") +Signed-off-by: Sasha Levin +--- + io_uring/uring_cmd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c +index caed143fb156d..b72154fefbee9 100644 +--- a/io_uring/uring_cmd.c ++++ b/io_uring/uring_cmd.c +@@ -268,7 +268,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) + if (ret == -EAGAIN) { + struct io_uring_cmd_data *cache = req->async_data; + +- if (ioucmd->sqe != (void *) cache) ++ if (ioucmd->sqe != cache->sqes) + memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { +-- +2.39.5 + diff --git a/queue-6.13/io_uring-uring_cmd-switch-sqe-to-async_data-on-eagai.patch b/queue-6.13/io_uring-uring_cmd-switch-sqe-to-async_data-on-eagai.patch new file mode 100644 index 0000000000..822cc9c265 --- /dev/null +++ b/queue-6.13/io_uring-uring_cmd-switch-sqe-to-async_data-on-eagai.patch @@ -0,0 +1,88 @@ +From 4acc0d943d83cef38250e06eb18faad59283c19b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 12 Feb 2025 13:45:46 -0700 +Subject: io_uring/uring_cmd: switch sqe to async_data on EAGAIN + +From: Caleb Sander Mateos + +[ Upstream commit e663da62ba8672aaa66843f1af8b20e3bb1a0515 ] + +5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") +moved the unconditional memcpy() of the uring_cmd SQE to async_data +to 2 cases when the request goes async: +- If REQ_F_FORCE_ASYNC is set to force the initial issue to go async +- If ->uring_cmd() returns -EAGAIN in the initial non-blocking issue + +Unlike the REQ_F_FORCE_ASYNC case, in the EAGAIN case, io_uring_cmd() +copies the SQE to async_data but neglects to update the io_uring_cmd's +sqe field to point to async_data. As a result, sqe still points to the +slot in the userspace-mapped SQ. At the end of io_submit_sqes(), the +kernel advances the SQ head index, allowing userspace to reuse the slot +for a new SQE. If userspace reuses the slot before the io_uring worker +reissues the original SQE, the io_uring_cmd's SQE will be corrupted. + +Introduce a helper io_uring_cmd_cache_sqes() to copy the original SQE to +the io_uring_cmd's async_data and point sqe there. Use it for both the +REQ_F_FORCE_ASYNC and EAGAIN cases. This ensures the uring_cmd doesn't +read from the SQ slot after it has been returned to userspace. 
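+
+The helper introduced below, reduced to its essence (annotation):
+
+	memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+	ioucmd->sqe = cache->sqes;	/* stop pointing into the SQ ring */
+
+Re-pointing sqe at the kernel-owned copy is the step the earlier
+-EAGAIN handling missed.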
+ +Signed-off-by: Caleb Sander Mateos +Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") +Link: https://lore.kernel.org/r/20250212204546.3751645-3-csander@purestorage.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/uring_cmd.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c +index b72154fefbee9..0ec58fcd6fc9b 100644 +--- a/io_uring/uring_cmd.c ++++ b/io_uring/uring_cmd.c +@@ -185,6 +185,15 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, + } + EXPORT_SYMBOL_GPL(io_uring_cmd_done); + ++static void io_uring_cmd_cache_sqes(struct io_kiocb *req) ++{ ++ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ++ struct io_uring_cmd_data *cache = req->async_data; ++ ++ memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); ++ ioucmd->sqe = cache->sqes; ++} ++ + static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) + { +@@ -195,14 +204,10 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, + if (unlikely(!cache)) + return -ENOMEM; + +- if (!(req->flags & REQ_F_FORCE_ASYNC)) { +- /* defer memcpy until we need it */ +- ioucmd->sqe = sqe; +- return 0; +- } +- +- memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); +- ioucmd->sqe = cache->sqes; ++ ioucmd->sqe = sqe; ++ /* defer memcpy until we need it */ ++ if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) ++ io_uring_cmd_cache_sqes(req); + return 0; + } + +@@ -269,7 +274,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) + struct io_uring_cmd_data *cache = req->async_data; + + if (ioucmd->sqe != cache->sqes) +- memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); ++ io_uring_cmd_cache_sqes(req); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { + return -EIOCBQUEUED; +-- +2.39.5 + diff --git a/queue-6.13/io_uring-uring_cmd-unconditionally-copy-sqes-at-prep.patch b/queue-6.13/io_uring-uring_cmd-unconditionally-copy-sqes-at-prep.patch new file mode 100644 index 0000000000..dac95e715e --- /dev/null +++ b/queue-6.13/io_uring-uring_cmd-unconditionally-copy-sqes-at-prep.patch @@ -0,0 +1,92 @@ +From 194e0f4bece4ed070886433c8ec3c3a5555085d6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Feb 2025 08:24:23 -0700 +Subject: io_uring/uring_cmd: unconditionally copy SQEs at prep time + +From: Jens Axboe + +[ Upstream commit d6211ebbdaa541af197b50b8dd8f22642ce0b87f ] + +This isn't generally necessary, but conditions have been observed where +SQE data is accessed from the original SQE after prep has been done and +outside of the initial issue. Opcode prep handlers must ensure that any +SQE related data is stable beyond the prep phase, but uring_cmd is a bit +special in how it handles the SQE which makes it susceptible to reading +stale data. If the application has reused the SQE before the original +completes, then that can lead to data corruption. + +Down the line we can relax this again once uring_cmd has been sanitized +a bit, and avoid unnecessarily copying the SQE. 
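+
+The resulting invariant, stated once (annotation): after prep,
+ioucmd->sqe always refers to the async_data copy rather than the shared
+SQ ring, so any reissue -- forced-async or after -EAGAIN -- reads a
+stable SQE even if userspace has already reused the ring slot.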
+ +Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") +Reported-by: Caleb Sander Mateos +Reviewed-by: Caleb Sander Mateos +Reviewed-by: Li Zetao +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/uring_cmd.c | 34 +++++++++++----------------------- + 1 file changed, 11 insertions(+), 23 deletions(-) + +diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c +index 0ec58fcd6fc9b..8c44a5198414e 100644 +--- a/io_uring/uring_cmd.c ++++ b/io_uring/uring_cmd.c +@@ -185,15 +185,6 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, + } + EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +-static void io_uring_cmd_cache_sqes(struct io_kiocb *req) +-{ +- struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); +- struct io_uring_cmd_data *cache = req->async_data; +- +- memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); +- ioucmd->sqe = cache->sqes; +-} +- + static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) + { +@@ -204,10 +195,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, + if (unlikely(!cache)) + return -ENOMEM; + +- ioucmd->sqe = sqe; +- /* defer memcpy until we need it */ +- if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) +- io_uring_cmd_cache_sqes(req); ++ /* ++ * Unconditionally cache the SQE for now - this is only needed for ++ * requests that go async, but prep handlers must ensure that any ++ * sqe data is stable beyond prep. Since uring_cmd is special in ++ * that it doesn't read in per-op data, play it safe and ensure that ++ * any SQE data is stable beyond prep. This can later get relaxed. ++ */ ++ memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); ++ ioucmd->sqe = cache->sqes; + return 0; + } + +@@ -270,16 +266,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) + } + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); +- if (ret == -EAGAIN) { +- struct io_uring_cmd_data *cache = req->async_data; +- +- if (ioucmd->sqe != cache->sqes) +- io_uring_cmd_cache_sqes(req); +- return -EAGAIN; +- } else if (ret == -EIOCBQUEUED) { +- return -EIOCBQUEUED; +- } +- ++ if (ret == -EAGAIN || ret == -EIOCBQUEUED) ++ return ret; + if (ret < 0) + req_set_fail(req); + io_req_uring_cleanup(req, issue_flags); +-- +2.39.5 + diff --git a/queue-6.13/ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch b/queue-6.13/ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch new file mode 100644 index 0000000000..075f1875c7 --- /dev/null +++ b/queue-6.13/ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch @@ -0,0 +1,47 @@ +From 219ce0f7b59df344e9491f142db2b216c232cb16 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:10 +0000 +Subject: ipv4: add RCU protection to ip4_dst_hoplimit() + +From: Eric Dumazet + +[ Upstream commit 469308552ca4560176cfc100e7ca84add1bebd7c ] + +ip4_dst_hoplimit() must use RCU protection to make +sure the net structure it reads does not disappear. 
+ +Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250205155120.1676781-3-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/route.h | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/include/net/route.h b/include/net/route.h +index 84cb1e04f5cd9..64949854d35dc 100644 +--- a/include/net/route.h ++++ b/include/net/route.h +@@ -368,10 +368,15 @@ static inline int inet_iif(const struct sk_buff *skb) + static inline int ip4_dst_hoplimit(const struct dst_entry *dst) + { + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); +- struct net *net = dev_net(dst->dev); + +- if (hoplimit == 0) ++ if (hoplimit == 0) { ++ const struct net *net; ++ ++ rcu_read_lock(); ++ net = dev_net_rcu(dst->dev); + hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); ++ rcu_read_unlock(); ++ } + return hoplimit; + } + +-- +2.39.5 + diff --git a/queue-6.13/ipv4-icmp-convert-to-dev_net_rcu.patch b/queue-6.13/ipv4-icmp-convert-to-dev_net_rcu.patch new file mode 100644 index 0000000000..4673b559b3 --- /dev/null +++ b/queue-6.13/ipv4-icmp-convert-to-dev_net_rcu.patch @@ -0,0 +1,150 @@ +From 0ab5b9962cf407903feb3b67b59cf1c81d844cc5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:16 +0000 +Subject: ipv4: icmp: convert to dev_net_rcu() + +From: Eric Dumazet + +[ Upstream commit 4b8474a0951e605d2a27a2c483da4eb4b8c63760 ] + +__icmp_send() must ensure rcu_read_lock() is held, as spotted +by Jakub. + +Other ICMP uses of dev_net() seem safe, change them to dev_net_rcu() +to get LOCKDEP support. + +Fixes: dde1bc0e6f86 ("[NETNS]: Add namespace for ICMP replying code.") +Closes: https://lore.kernel.org/netdev/20250203153633.46ce0337@kernel.org/ +Reported-by: Jakub Kicinski +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250205155120.1676781-9-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv4/icmp.c | 31 +++++++++++++++++-------------- + 1 file changed, 17 insertions(+), 14 deletions(-) + +diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c +index 094084b61bff8..5482edb5aade2 100644 +--- a/net/ipv4/icmp.c ++++ b/net/ipv4/icmp.c +@@ -399,10 +399,10 @@ static void icmp_push_reply(struct sock *sk, + + static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) + { +- struct ipcm_cookie ipc; + struct rtable *rt = skb_rtable(skb); +- struct net *net = dev_net(rt->dst.dev); ++ struct net *net = dev_net_rcu(rt->dst.dev); + bool apply_ratelimit = false; ++ struct ipcm_cookie ipc; + struct flowi4 fl4; + struct sock *sk; + struct inet_sock *inet; +@@ -608,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + struct sock *sk; + + if (!rt) +- goto out; ++ return; ++ ++ rcu_read_lock(); + + if (rt->dst.dev) +- net = dev_net(rt->dst.dev); ++ net = dev_net_rcu(rt->dst.dev); + else if (skb_in->dev) +- net = dev_net(skb_in->dev); ++ net = dev_net_rcu(skb_in->dev); + else + goto out; + +@@ -785,7 +787,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + icmp_xmit_unlock(sk); + out_bh_enable: + local_bh_enable(); +-out:; ++out: ++ rcu_read_unlock(); + } + EXPORT_SYMBOL(__icmp_send); + +@@ -834,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) + * avoid additional coding at protocol handlers. 
+ */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { +- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); ++ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); + return; + } + +@@ -868,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) + struct net *net; + u32 info = 0; + +- net = dev_net(skb_dst(skb)->dev); ++ net = dev_net_rcu(skb_dst(skb)->dev); + + /* + * Incomplete header ? +@@ -979,7 +982,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) + static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) + { + if (skb->len < sizeof(struct iphdr)) { +- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); ++ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); + return SKB_DROP_REASON_PKT_TOO_SMALL; + } + +@@ -1011,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) + struct icmp_bxm icmp_param; + struct net *net; + +- net = dev_net(skb_dst(skb)->dev); ++ net = dev_net_rcu(skb_dst(skb)->dev); + /* should there be an ICMP stat for ignored echos? */ + if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) + return SKB_NOT_DROPPED_YET; +@@ -1040,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) + + bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) + { ++ struct net *net = dev_net_rcu(skb->dev); + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; +- struct net *net = dev_net(skb->dev); + struct inet6_dev *in6_dev; + struct in_device *in_dev; + struct net_device *dev; +@@ -1181,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) + return SKB_NOT_DROPPED_YET; + + out_err: +- __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); ++ __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + return SKB_DROP_REASON_PKT_TOO_SMALL; + } + +@@ -1198,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb) + { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; + struct rtable *rt = skb_rtable(skb); +- struct net *net = dev_net(rt->dst.dev); ++ struct net *net = dev_net_rcu(rt->dst.dev); + struct icmphdr *icmph; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { +@@ -1371,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info) + struct iphdr *iph = (struct iphdr *)skb->data; + int offset = iph->ihl<<2; + struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); ++ struct net *net = dev_net_rcu(skb->dev); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; +- struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those +-- +2.39.5 + diff --git a/queue-6.13/ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch b/queue-6.13/ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch new file mode 100644 index 0000000000..d2f3e68b97 --- /dev/null +++ b/queue-6.13/ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch @@ -0,0 +1,77 @@ +From 85003d41b2397dc27b26b76237142f39eaaede8c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:15 +0000 +Subject: ipv4: use RCU protection in __ip_rt_update_pmtu() + +From: Eric Dumazet + +[ Upstream commit 139512191bd06f1b496117c76372b2ce372c9a41 ] + +__ip_rt_update_pmtu() must use RCU protection to make +sure the net structure it reads does not disappear. 
+ +Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device") +Fixes: 1de6b15a434c ("Namespaceify min_pmtu sysctl") +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250205155120.1676781-8-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv4/route.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index 152697459e918..cf84704af25c3 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -1008,9 +1008,9 @@ out: kfree_skb_reason(skb, reason); + static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) + { + struct dst_entry *dst = &rt->dst; +- struct net *net = dev_net(dst->dev); + struct fib_result res; + bool lock = false; ++ struct net *net; + u32 old_mtu; + + if (ip_mtu_locked(dst)) +@@ -1020,6 +1020,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) + if (old_mtu < mtu) + return; + ++ rcu_read_lock(); ++ net = dev_net_rcu(dst->dev); + if (mtu < net->ipv4.ip_rt_min_pmtu) { + lock = true; + mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); +@@ -1027,9 +1029,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) + + if (rt->rt_pmtu == mtu && !lock && + time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) +- return; ++ goto out; + +- rcu_read_lock(); + if (fib_lookup(net, fl4, &res, 0) == 0) { + struct fib_nh_common *nhc; + +@@ -1043,14 +1044,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, + jiffies + net->ipv4.ip_rt_mtu_expires); + } +- rcu_read_unlock(); +- return; ++ goto out; + } + #endif /* CONFIG_IP_ROUTE_MULTIPATH */ + nhc = FIB_RES_NHC(res); + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, + jiffies + net->ipv4.ip_rt_mtu_expires); + } ++out: + rcu_read_unlock(); + } + +-- +2.39.5 + diff --git a/queue-6.13/ipv4-use-rcu-protection-in-inet_select_addr.patch b/queue-6.13/ipv4-use-rcu-protection-in-inet_select_addr.patch new file mode 100644 index 0000000000..3cea021705 --- /dev/null +++ b/queue-6.13/ipv4-use-rcu-protection-in-inet_select_addr.patch @@ -0,0 +1,41 @@ +From ae6258e38e3a5aad38a8b6c57530d3963786a6e2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:14 +0000 +Subject: ipv4: use RCU protection in inet_select_addr() + +From: Eric Dumazet + +[ Upstream commit 719817cd293e4fa389e1f69c396f3f816ed5aa41 ] + +inet_select_addr() must use RCU protection to make +sure the net structure it reads does not disappear. 
+ +Fixes: c4544c724322 ("[NETNS]: Process inet_select_addr inside a namespace.") +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250205155120.1676781-7-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv4/devinet.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c +index c8b3cf5fba4c0..55b8151759bc9 100644 +--- a/net/ipv4/devinet.c ++++ b/net/ipv4/devinet.c +@@ -1371,10 +1371,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) + __be32 addr = 0; + unsigned char localnet_scope = RT_SCOPE_HOST; + struct in_device *in_dev; +- struct net *net = dev_net(dev); ++ struct net *net; + int master_idx; + + rcu_read_lock(); ++ net = dev_net_rcu(dev); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto no_in_dev; +-- +2.39.5 + diff --git a/queue-6.13/ipv4-use-rcu-protection-in-ip_dst_mtu_maybe_forward.patch b/queue-6.13/ipv4-use-rcu-protection-in-ip_dst_mtu_maybe_forward.patch new file mode 100644 index 0000000000..f5eef0e03a --- /dev/null +++ b/queue-6.13/ipv4-use-rcu-protection-in-ip_dst_mtu_maybe_forward.patch @@ -0,0 +1,57 @@ +From 101c5d94e9ce20eef9973a037e7375547a55ae1b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:11 +0000 +Subject: ipv4: use RCU protection in ip_dst_mtu_maybe_forward() + +From: Eric Dumazet + +[ Upstream commit 071d8012869b6af352acca346ade13e7be90a49f ] + +ip_dst_mtu_maybe_forward() must use RCU protection to make +sure the net structure it reads does not disappear. + +Fixes: f87c10a8aa1e8 ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250205155120.1676781-4-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/ip.h | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/include/net/ip.h b/include/net/ip.h +index 0e548c1f2a0ec..23ecb10945b0f 100644 +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, + bool forwarding) + { + const struct rtable *rt = dst_rtable(dst); +- struct net *net = dev_net(dst->dev); +- unsigned int mtu; ++ unsigned int mtu, res; ++ struct net *net; ++ ++ rcu_read_lock(); + ++ net = dev_net_rcu(dst->dev); + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || + ip_mtu_locked(dst) || + !forwarding) { +@@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, + out: + mtu = min_t(unsigned int, mtu, IP_MAX_MTU); + +- return mtu - lwtunnel_headroom(dst->lwtstate, mtu); ++ res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); ++ ++ rcu_read_unlock(); ++ ++ return res; + } + + static inline unsigned int ip_skb_dst_mtu(struct sock *sk, +-- +2.39.5 + diff --git a/queue-6.13/ipv4-use-rcu-protection-in-ipv4_default_advmss.patch b/queue-6.13/ipv4-use-rcu-protection-in-ipv4_default_advmss.patch new file mode 100644 index 0000000000..9a219b4577 --- /dev/null +++ b/queue-6.13/ipv4-use-rcu-protection-in-ipv4_default_advmss.patch @@ -0,0 +1,48 @@ +From b949ea11155f9806574e4f0502e164d073d585f6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:12 +0000 +Subject: ipv4: use RCU protection in ipv4_default_advmss() + +From: Eric Dumazet + +[ Upstream commit 71b8471c93fa0bcab911fcb65da1eb6c4f5f735f ] + +ipv4_default_advmss() must 
use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: 2e9589ff809e ("ipv4: Namespaceify min_adv_mss sysctl knob")
+Signed-off-by: Eric Dumazet
+Reviewed-by: Kuniyuki Iwashima
+Link: https://patch.msgid.link/20250205155120.1676781-5-edumazet@google.com
+Signed-off-by: Jakub Kicinski
+Signed-off-by: Sasha Levin
+---
+ net/ipv4/route.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 3a1467f2d553f..829c8d41aaae2 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1307,10 +1307,15 @@ static void set_class_tag(struct rtable *rt, u32 tag)
+
+ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+ {
+- struct net *net = dev_net(dst->dev);
+ unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
+- unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+- net->ipv4.ip_rt_min_advmss);
++ unsigned int advmss;
++ struct net *net;
++
++ rcu_read_lock();
++ net = dev_net_rcu(dst->dev);
++ advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
++ net->ipv4.ip_rt_min_advmss);
++ rcu_read_unlock();
+
+ return min(advmss, IPV4_MAX_PMTU - header_size);
+ }
+--
+2.39.5
+
diff --git a/queue-6.13/ipv4-use-rcu-protection-in-rt_is_expired.patch b/queue-6.13/ipv4-use-rcu-protection-in-rt_is_expired.patch
new file mode 100644
index 0000000000..637461b575
--- /dev/null
+++ b/queue-6.13/ipv4-use-rcu-protection-in-rt_is_expired.patch
@@ -0,0 +1,44 @@
+From a56954856102acb77e60ee54a6b7e3ed50595caf Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 5 Feb 2025 15:51:13 +0000
+Subject: ipv4: use RCU protection in rt_is_expired()
+
+From: Eric Dumazet
+
+[ Upstream commit dd205fcc33d92d54eee4d7f21bb073af9bd5ce2b ]
+
+rt_is_expired() must use RCU protection to make
+sure the net structure it reads does not disappear.
+
+Fixes: e84f84f27647 ("netns: place rt_genid into struct net")
+Signed-off-by: Eric Dumazet
+Reviewed-by: Kuniyuki Iwashima
+Link: https://patch.msgid.link/20250205155120.1676781-6-edumazet@google.com
+Signed-off-by: Jakub Kicinski
+Signed-off-by: Sasha Levin
+---
+ net/ipv4/route.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 829c8d41aaae2..152697459e918 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void)
+
+ static inline bool rt_is_expired(const struct rtable *rth)
+ {
+- return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
++ bool res;
++
++ rcu_read_lock();
++ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
++ rcu_read_unlock();
++
++ return res;
+ }
+
+ void rt_cache_flush(struct net *net)
+--
+2.39.5
+
diff --git a/queue-6.13/ipv6-icmp-convert-to-dev_net_rcu.patch b/queue-6.13/ipv6-icmp-convert-to-dev_net_rcu.patch
new file mode 100644
index 0000000000..60e4abdae5
--- /dev/null
+++ b/queue-6.13/ipv6-icmp-convert-to-dev_net_rcu.patch
@@ -0,0 +1,191 @@
+From 76653a4b4fa90459562811166a58cb87c9a40fe0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 5 Feb 2025 15:51:19 +0000
+Subject: ipv6: icmp: convert to dev_net_rcu()
+
+From: Eric Dumazet
+
+[ Upstream commit 34aef2b0ce3aa4eb4ef2e1f5cad3738d527032f5 ]
+
+icmp6_send() must acquire rcu_read_lock() sooner to ensure
+the dev_net() call is done from a safe context.
+
+Other ICMPv6 uses of dev_net() seem safe; change them to
+dev_net_rcu() to get LOCKDEP support to catch bugs.
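+
+As a sketch (illustration only, not part of the change itself), the
+resulting shape of icmp6_send() is one RCU read-side section with a
+single exit label, so every early return unlocks exactly once:
+
+    rcu_read_lock();
+    net = dev_net_rcu(skb->dev);
+    ...
+    if (early_error)        /* placeholder for any bail-out condition */
+        goto out;
+    ...
+out:
+    rcu_read_unlock();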
+ +Fixes: 9a43b709a230 ("[NETNS][IPV6] icmp6 - make icmpv6_socket per namespace") +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250205155120.1676781-12-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/icmp.c | 42 +++++++++++++++++++++++------------------- + 1 file changed, 23 insertions(+), 19 deletions(-) + +diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c +index a6984a29fdb9d..4d14ab7f7e99f 100644 +--- a/net/ipv6/icmp.c ++++ b/net/ipv6/icmp.c +@@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + { + /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ + struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); +- struct net *net = dev_net(skb->dev); ++ struct net *net = dev_net_rcu(skb->dev); + + if (type == ICMPV6_PKT_TOOBIG) + ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); +@@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + + if (!skb->dev) + return; +- net = dev_net(skb->dev); ++ ++ rcu_read_lock(); ++ ++ net = dev_net_rcu(skb->dev); + mark = IP6_REPLY_MARK(net, skb->mark); + /* + * Make sure we respect the rules +@@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + !(type == ICMPV6_PARAMPROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) +- return; ++ goto out; + + saddr = NULL; + } +@@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", + &hdr->saddr, &hdr->daddr); +- return; ++ goto out; + } + + /* +@@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + if (is_ineligible(skb)) { + net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", + &hdr->saddr, &hdr->daddr); +- return; ++ goto out; + } + + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ +@@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + np = inet6_sk(sk); + + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) +- goto out; ++ goto out_unlock; + + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; +@@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + + dst = icmpv6_route_lookup(net, skb, sk, &fl6); + if (IS_ERR(dst)) +- goto out; ++ goto out_unlock; + + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + +@@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + goto out_dst_release; + } + +- rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + + if (ip6_append_data(sk, icmpv6_getfrag, &msg, +@@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); + } +- rcu_read_unlock(); ++ + out_dst_release: + dst_release(dst); +-out: ++out_unlock: + icmpv6_xmit_unlock(sk); + out_bh_enable: + local_bh_enable(); ++out: ++ rcu_read_unlock(); + } + EXPORT_SYMBOL(icmp6_send); + +@@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, + skb_pull(skb2, nhs); + skb_reset_network_header(skb2); + +- rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, +- skb, 0); ++ rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, ++ NULL, 0, skb, 0); + + if (rt && rt->dst.dev) + 
skb2->dev = rt->dst.dev; +@@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); + + static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) + { +- struct net *net = dev_net(skb->dev); ++ struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; + struct inet6_dev *idev; + struct ipv6_pinfo *np; +@@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info) + { + struct inet6_skb_parm *opt = IP6CB(skb); +- struct net *net = dev_net(skb->dev); ++ struct net *net = dev_net_rcu(skb->dev); + const struct inet6_protocol *ipprot; + enum skb_drop_reason reason; + int inner_offset; +@@ -889,7 +893,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + static int icmpv6_rcv(struct sk_buff *skb) + { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; +- struct net *net = dev_net(skb->dev); ++ struct net *net = dev_net_rcu(skb->dev); + struct net_device *dev = icmp6_dev(skb); + struct inet6_dev *idev = __in6_dev_get(dev); + const struct in6_addr *saddr, *daddr; +@@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) + skb_set_network_header(skb, nh); + } + +- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); ++ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; +@@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) + + type = hdr->icmp6_type; + +- ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); ++ ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); + + switch (type) { + case ICMPV6_ECHO_REQUEST: +@@ -1034,9 +1038,9 @@ static int icmpv6_rcv(struct sk_buff *skb) + + csum_error: + reason = SKB_DROP_REASON_ICMP_CSUM; +- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); ++ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); + discard_it: +- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); ++ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); + drop_no_count: + kfree_skb_reason(skb, reason); + return 0; +-- +2.39.5 + diff --git a/queue-6.13/ipv6-mcast-add-rcu-protection-to-mld_newpack.patch b/queue-6.13/ipv6-mcast-add-rcu-protection-to-mld_newpack.patch new file mode 100644 index 0000000000..f953e3fa8e --- /dev/null +++ b/queue-6.13/ipv6-mcast-add-rcu-protection-to-mld_newpack.patch @@ -0,0 +1,80 @@ +From 37750b92b6ab7b92337b71d95a7972765c0e643d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 12 Feb 2025 14:10:21 +0000 +Subject: ipv6: mcast: add RCU protection to mld_newpack() + +From: Eric Dumazet + +[ Upstream commit a527750d877fd334de87eef81f1cb5f0f0ca3373 ] + +mld_newpack() can be called without RTNL or RCU being held. + +Note that we no longer can use sock_alloc_send_skb() because +ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. + +Instead use alloc_skb() and charge the net->ipv6.igmp_sk +socket under RCU protection. 
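+
+Condensed from the hunks below (sketch only), the new allocation and
+charging sequence is:
+
+    skb = alloc_skb(size, GFP_KERNEL);  /* may sleep: kept outside RCU */
+    if (!skb)
+        return NULL;
+    rcu_read_lock();
+    net = dev_net_rcu(dev);
+    skb_set_owner_w(skb, net->ipv6.igmp_sk); /* charge per-netns socket */
+    ...
+    rcu_read_unlock();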
+ +Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Link: https://patch.msgid.link/20250212141021.1663666-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/mcast.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c +index 6551648512585..b7b62e5a562e5 100644 +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -1730,21 +1730,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) + struct net_device *dev = idev->dev; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; +- struct net *net = dev_net(dev); + const struct in6_addr *saddr; + struct in6_addr addr_buf; + struct mld2_report *pmr; + struct sk_buff *skb; + unsigned int size; + struct sock *sk; +- int err; ++ struct net *net; + +- sk = net->ipv6.igmp_sk; + /* we assume size > sizeof(ra) here + * Also try to not allocate high-order pages for big MTU + */ + size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; +- skb = sock_alloc_send_skb(sk, size, 1, &err); ++ skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return NULL; + +@@ -1752,6 +1750,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) + skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); + ++ rcu_read_lock(); ++ ++ net = dev_net_rcu(dev); ++ sk = net->ipv6.igmp_sk; ++ skb_set_owner_w(skb, sk); ++ + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + /* : + * use unspecified address as the source address +@@ -1763,6 +1767,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) + + ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + ++ rcu_read_unlock(); ++ + skb_put_data(skb, ra, sizeof(ra)); + + skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); +-- +2.39.5 + diff --git a/queue-6.13/ipv6-mcast-extend-rcu-protection-in-igmp6_send.patch b/queue-6.13/ipv6-mcast-extend-rcu-protection-in-igmp6_send.patch new file mode 100644 index 0000000000..3bc3c578ff --- /dev/null +++ b/queue-6.13/ipv6-mcast-extend-rcu-protection-in-igmp6_send.patch @@ -0,0 +1,105 @@ +From db0123953cf826aa587b2daff17492629b27fff6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:40 +0000 +Subject: ipv6: mcast: extend RCU protection in igmp6_send() + +From: Eric Dumazet + +[ Upstream commit 087c1faa594fa07a66933d750c0b2610aa1a2946 ] + +igmp6_send() can be called without RTNL or RCU being held. + +Extend RCU protection so that we can safely fetch the net pointer +and avoid a potential UAF. + +Note that we no longer can use sock_alloc_send_skb() because +ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. + +Instead use alloc_skb() and charge the net->ipv6.igmp_sk +socket under RCU protection. 
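+
+For background, the substitution relied on here is roughly the
+following (a sketch that ignores the wmem throttling which
+sock_alloc_send_skb() additionally performs):
+
+    /* was: skb = sock_alloc_send_skb(sk, size, 1, &err); */
+    skb = alloc_skb(size, GFP_KERNEL);
+    if (skb)
+        skb_set_owner_w(skb, sk);   /* same socket accounting as before */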
+ +Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-9-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/mcast.c | 31 +++++++++++++++---------------- + 1 file changed, 15 insertions(+), 16 deletions(-) + +diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c +index b244dbf61d5f3..6551648512585 100644 +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -2122,21 +2122,21 @@ static void mld_send_cr(struct inet6_dev *idev) + + static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) + { +- struct net *net = dev_net(dev); +- struct sock *sk = net->ipv6.igmp_sk; ++ const struct in6_addr *snd_addr, *saddr; ++ int err, len, payload_len, full_len; ++ struct in6_addr addr_buf; + struct inet6_dev *idev; + struct sk_buff *skb; + struct mld_msg *hdr; +- const struct in6_addr *snd_addr, *saddr; +- struct in6_addr addr_buf; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; +- int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; +- struct flowi6 fl6; + struct dst_entry *dst; ++ struct flowi6 fl6; ++ struct net *net; ++ struct sock *sk; + + if (type == ICMPV6_MGM_REDUCTION) + snd_addr = &in6addr_linklocal_allrouters; +@@ -2147,19 +2147,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; + +- rcu_read_lock(); +- IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); +- rcu_read_unlock(); ++ skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); + +- skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); ++ rcu_read_lock(); + ++ net = dev_net_rcu(dev); ++ idev = __in6_dev_get(dev); ++ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); + if (!skb) { +- rcu_read_lock(); +- IP6_INC_STATS(net, __in6_dev_get(dev), +- IPSTATS_MIB_OUTDISCARDS); ++ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + return; + } ++ sk = net->ipv6.igmp_sk; ++ skb_set_owner_w(skb, sk); ++ + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, hlen); + +@@ -2184,9 +2186,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) + IPPROTO_ICMPV6, + csum_partial(hdr, len, 0)); + +- rcu_read_lock(); +- idev = __in6_dev_get(skb->dev); +- + icmpv6_flow_init(sk, &fl6, type, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); +-- +2.39.5 + diff --git a/queue-6.13/ipv6-use-rcu-protection-in-ip6_default_advmss.patch b/queue-6.13/ipv6-use-rcu-protection-in-ip6_default_advmss.patch new file mode 100644 index 0000000000..1d6a5cb81f --- /dev/null +++ b/queue-6.13/ipv6-use-rcu-protection-in-ip6_default_advmss.patch @@ -0,0 +1,49 @@ +From d00acab98c62e79343dac4811af016a999cefc1e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:18 +0000 +Subject: ipv6: use RCU protection in ip6_default_advmss() + +From: Eric Dumazet + +[ Upstream commit 3c8ffcd248da34fc41e52a46e51505900115fc2a ] + +ip6_default_advmss() needs rcu protection to make +sure the net structure it reads does not disappear. 
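+
+The critical section is kept minimal, covering only the per-netns
+sysctl read (sketch, condensed from the hunk below):
+
+    rcu_read_lock();
+    net = dev_net_rcu(dev);
+    if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
+        mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+    rcu_read_unlock();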
+ +Fixes: 5578689a4e3c ("[NETNS][IPV6] route6 - make route6 per namespace") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250205155120.1676781-11-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/route.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 67ff16c047180..997e2e4f441d2 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -3196,13 +3196,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) + { + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); +- struct net *net = dev_net(dev); ++ struct net *net; + + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + ++ rcu_read_lock(); ++ ++ net = dev_net_rcu(dev); + if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) + mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + ++ rcu_read_unlock(); ++ + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. +-- +2.39.5 + diff --git a/queue-6.13/ndisc-extend-rcu-protection-in-ndisc_send_skb.patch b/queue-6.13/ndisc-extend-rcu-protection-in-ndisc_send_skb.patch new file mode 100644 index 0000000000..1760d1c8ec --- /dev/null +++ b/queue-6.13/ndisc-extend-rcu-protection-in-ndisc_send_skb.patch @@ -0,0 +1,72 @@ +From 6085714e662d83314202ab333bb0336cc7306a8e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:39 +0000 +Subject: ndisc: extend RCU protection in ndisc_send_skb() + +From: Eric Dumazet + +[ Upstream commit ed6ae1f325d3c43966ec1b62ac1459e2b8e45640 ] + +ndisc_send_skb() can be called without RTNL or RCU held. + +Acquire rcu_read_lock() earlier, so that we can use dev_net_rcu() +and avoid a potential UAF. 
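+
+Sketch of the resulting control flow (illustration only): the lock is
+now taken before the first dereference, so the route lookup failure
+path must drop it before freeing the skb:
+
+    rcu_read_lock();
+    net = dev_net_rcu(skb->dev);
+    sk = net->ipv6.ndisc_sk;
+    ...
+    if (IS_ERR(dst)) {
+        rcu_read_unlock();  /* every early return must unlock */
+        kfree_skb(skb);
+        return;
+    }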
+ +Fixes: 1762f7e88eb3 ("[NETNS][IPV6] ndisc - make socket control per namespace") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-8-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/ndisc.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c +index 90f8aa2d7af2e..8699d1a188dc4 100644 +--- a/net/ipv6/ndisc.c ++++ b/net/ipv6/ndisc.c +@@ -471,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, + void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + const struct in6_addr *saddr) + { ++ struct icmp6hdr *icmp6h = icmp6_hdr(skb); + struct dst_entry *dst = skb_dst(skb); +- struct net *net = dev_net(skb->dev); +- struct sock *sk = net->ipv6.ndisc_sk; + struct inet6_dev *idev; ++ struct net *net; ++ struct sock *sk; + int err; +- struct icmp6hdr *icmp6h = icmp6_hdr(skb); + u8 type; + + type = icmp6h->icmp6_type; + ++ rcu_read_lock(); ++ ++ net = dev_net_rcu(skb->dev); ++ sk = net->ipv6.ndisc_sk; + if (!dst) { + struct flowi6 fl6; + int oif = skb->dev->ifindex; +@@ -488,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { ++ rcu_read_unlock(); + kfree_skb(skb); + return; + } +@@ -502,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + + ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); + +- rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); + +-- +2.39.5 + diff --git a/queue-6.13/ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch b/queue-6.13/ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch new file mode 100644 index 0000000000..6b9669a4b3 --- /dev/null +++ b/queue-6.13/ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch @@ -0,0 +1,59 @@ +From 78ac8b14a8da7091a7a851e9afb9885ef4ed25eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:34 +0000 +Subject: ndisc: use RCU protection in ndisc_alloc_skb() + +From: Eric Dumazet + +[ Upstream commit 628e6d18930bbd21f2d4562228afe27694f66da9 ] + +ndisc_alloc_skb() can be called without RTNL or RCU being held. + +Add RCU protection to avoid possible UAF. 
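+
+The allocation here uses GFP_ATOMIC and thus could even sit inside the
+read-side section; keeping that section minimal is still preferable
+(sketch, condensed from the hunk below):
+
+    rcu_read_lock();
+    skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
+    rcu_read_unlock();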
+ +Fixes: de09334b9326 ("ndisc: Introduce ndisc_alloc_skb() helper.") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-3-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/ndisc.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c +index 264b10a947577..90f8aa2d7af2e 100644 +--- a/net/ipv6/ndisc.c ++++ b/net/ipv6/ndisc.c +@@ -418,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, + { + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; +- struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; + struct sk_buff *skb; + + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); +- if (!skb) { +- ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", +- __func__); ++ if (!skb) + return NULL; +- } + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; +@@ -437,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, + /* Manually assign socket ownership as we avoid calling + * sock_alloc_send_pskb() to bypass wmem buffer limits + */ +- skb_set_owner_w(skb, sk); ++ rcu_read_lock(); ++ skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); ++ rcu_read_unlock(); + + return skb; + } +-- +2.39.5 + diff --git a/queue-6.13/neighbour-use-rcu-protection-in-__neigh_notify.patch b/queue-6.13/neighbour-use-rcu-protection-in-__neigh_notify.patch new file mode 100644 index 0000000000..73f86b4c74 --- /dev/null +++ b/queue-6.13/neighbour-use-rcu-protection-in-__neigh_notify.patch @@ -0,0 +1,58 @@ +From 7621c81b2327d4856333ab1c92e86a266a07456f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:35 +0000 +Subject: neighbour: use RCU protection in __neigh_notify() + +From: Eric Dumazet + +[ Upstream commit becbd5850c03ed33b232083dd66c6e38c0c0e569 ] + +__neigh_notify() can be called without RTNL or RCU protection. + +Use RCU protection to avoid potential UAF. 
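+
+As a sketch (illustration only), success and failure paths now
+converge on a single unlock:
+
+    rcu_read_lock();
+    net = dev_net_rcu(n->dev);
+    ...
+    rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+    goto out;
+errout:
+    rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+    rcu_read_unlock();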
+ +Fixes: 426b5303eb43 ("[NETNS]: Modify the neighbour table code so it handles multiple network namespaces") +Signed-off-by: Eric Dumazet +Reviewed-by: David Ahern +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-4-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/core/neighbour.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/net/core/neighbour.c b/net/core/neighbour.c +index 89656d180bc60..bd0251bd74a1f 100644 +--- a/net/core/neighbour.c ++++ b/net/core/neighbour.c +@@ -3447,10 +3447,12 @@ static const struct seq_operations neigh_stat_seq_ops = { + static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid) + { +- struct net *net = dev_net(n->dev); + struct sk_buff *skb; + int err = -ENOBUFS; ++ struct net *net; + ++ rcu_read_lock(); ++ net = dev_net_rcu(n->dev); + skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; +@@ -3463,9 +3465,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); +- return; ++ goto out; + errout: + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); ++out: ++ rcu_read_unlock(); + } + + void neigh_app_ns(struct neighbour *n) +-- +2.39.5 + diff --git a/queue-6.13/net-add-dev_net_rcu-helper.patch b/queue-6.13/net-add-dev_net_rcu-helper.patch new file mode 100644 index 0000000000..782d26ea8d --- /dev/null +++ b/queue-6.13/net-add-dev_net_rcu-helper.patch @@ -0,0 +1,62 @@ +From 61a2be1d225562f44f63eb4d42dd3116fba2316f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Feb 2025 15:51:09 +0000 +Subject: net: add dev_net_rcu() helper + +From: Eric Dumazet + +[ Upstream commit 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 ] + +dev->nd_net can change, readers should either +use rcu_read_lock() or RTNL. + +We currently use a generic helper, dev_net() with +no debugging support. We probably have many hidden bugs. + +Add dev_net_rcu() helper for callers using rcu_read_lock() +protection. 
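+
+Sketch of the debugging benefit (illustration only): with
+CONFIG_PROVE_RCU enabled, a misuse is flagged where plain dev_net()
+would stay silent:
+
+    net = dev_net_rcu(dev); /* outside a read-side section: lockdep splat */
+
+    rcu_read_lock();
+    net = dev_net_rcu(dev); /* ok: rcu_dereference() check passes */
+    rcu_read_unlock();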
+ +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com +Signed-off-by: Jakub Kicinski +Stable-dep-of: 71b8471c93fa ("ipv4: use RCU protection in ipv4_default_advmss()") +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 6 ++++++ + include/net/net_namespace.h | 2 +- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 3928e91bb5905..8268be0723eee 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2635,6 +2635,12 @@ struct net *dev_net(const struct net_device *dev) + return read_pnet(&dev->nd_net); + } + ++static inline ++struct net *dev_net_rcu(const struct net_device *dev) ++{ ++ return read_pnet_rcu(&dev->nd_net); ++} ++ + static inline + void dev_net_set(struct net_device *dev, struct net *net) + { +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h +index 5a2a0df8ad91b..44be742cf4d60 100644 +--- a/include/net/net_namespace.h ++++ b/include/net/net_namespace.h +@@ -396,7 +396,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) + #endif + } + +-static inline struct net *read_pnet_rcu(possible_net_t *pnet) ++static inline struct net *read_pnet_rcu(const possible_net_t *pnet) + { + #ifdef CONFIG_NET_NS + return rcu_dereference(pnet->net); +-- +2.39.5 + diff --git a/queue-6.13/net-add-netdev-up-protected-by-netdev_lock.patch b/queue-6.13/net-add-netdev-up-protected-by-netdev_lock.patch new file mode 100644 index 0000000000..9188ef115a --- /dev/null +++ b/queue-6.13/net-add-netdev-up-protected-by-netdev_lock.patch @@ -0,0 +1,118 @@ +From 996f44c563e77c7a00e23892c4a95f664c97ffe4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jan 2025 19:53:12 -0800 +Subject: net: add netdev->up protected by netdev_lock() + +From: Jakub Kicinski + +[ Upstream commit 5112457f3d8e41f987908266068af88ef9f3ab78 ] + +Some uAPI (netdev netlink) hide net_device's sub-objects while +the interface is down to ensure uniform behavior across drivers. +To remove the rtnl_lock dependency from those uAPIs we need a way +to safely tell if the device is down or up. + +Add an indication of whether device is open or closed, protected +by netdev->lock. The semantics are the same as IFF_UP, but taking +netdev_lock around every write to ->flags would be a lot of code +churn. + +We don't want to blanket the entire open / close path by netdev_lock, +because it will prevent us from applying it to specific structures - +core helpers won't be able to take that lock from any function +called by the drivers on open/close paths. + +So the state of the flag is "pessimistic", as in it may report false +negatives, but never false positives. 
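+
+A reader that previously needed rtnl_lock to test IFF_UP can now do
+the following (sketch only; the value may be a false negative while
+the device is being opened or closed):
+
+    netdev_lock(dev);
+    up = dev->up;
+    netdev_unlock(dev);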
+ +Reviewed-by: Joe Damato +Reviewed-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250115035319.559603-5-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 14 +++++++++++++- + net/core/dev.c | 4 ++-- + net/core/dev.h | 12 ++++++++++++ + 3 files changed, 27 insertions(+), 3 deletions(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 47f817bcea503..64013fd389f28 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2441,12 +2441,24 @@ struct net_device { + unsigned long gro_flush_timeout; + u32 napi_defer_hard_irqs; + ++ /** ++ * @up: copy of @state's IFF_UP, but safe to read with just @lock. ++ * May report false negatives while the device is being opened ++ * or closed (@lock does not protect .ndo_open, or .ndo_close). ++ */ ++ bool up; ++ + /** + * @lock: netdev-scope lock, protects a small selection of fields. + * Should always be taken using netdev_lock() / netdev_unlock() helpers. + * Drivers are free to use it for other protection. + * +- * Protects: @reg_state, @net_shaper_hierarchy. ++ * Protects: ++ * @net_shaper_hierarchy, @reg_state ++ * ++ * Partially protects (writers must hold both @lock and rtnl_lock): ++ * @up ++ * + * Ordering: take after rtnl_lock. + */ + struct mutex lock; +diff --git a/net/core/dev.c b/net/core/dev.c +index 75996e1aac46c..67f2bb84db543 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1512,7 +1512,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { +- dev->flags |= IFF_UP; ++ netif_set_up(dev, true); + dev_set_rx_mode(dev); + dev_activate(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); +@@ -1591,7 +1591,7 @@ static void __dev_close_many(struct list_head *head) + if (ops->ndo_stop) + ops->ndo_stop(dev); + +- dev->flags &= ~IFF_UP; ++ netif_set_up(dev, false); + netpoll_poll_enable(dev); + } + } +diff --git a/net/core/dev.h b/net/core/dev.h +index deb5eae5749fa..e17c640c05fb9 100644 +--- a/net/core/dev.h ++++ b/net/core/dev.h +@@ -111,6 +111,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh); + ++static inline void netif_set_up(struct net_device *dev, bool value) ++{ ++ if (value) ++ dev->flags |= IFF_UP; ++ else ++ dev->flags &= ~IFF_UP; ++ ++ netdev_lock(dev); ++ dev->up = value; ++ netdev_unlock(dev); ++} ++ + static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) + { +-- +2.39.5 + diff --git a/queue-6.13/net-add-netdev_lock-netdev_unlock-helpers.patch b/queue-6.13/net-add-netdev_lock-netdev_unlock-helpers.patch new file mode 100644 index 0000000000..ba8b334de9 --- /dev/null +++ b/queue-6.13/net-add-netdev_lock-netdev_unlock-helpers.patch @@ -0,0 +1,435 @@ +From bfea92571978b90d08add497b4ba4886a8218705 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jan 2025 19:53:09 -0800 +Subject: net: add netdev_lock() / netdev_unlock() helpers + +From: Jakub Kicinski + +[ Upstream commit ebda2f0bbde540ff7da168d2837f8cfb14581e2e ] + +Add helpers for locking the netdev instance, use it in drivers +and the shaper code. This will make grepping for the lock usage +much easier, as we extend the lock to cover more fields. 
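+
+The helpers are thin, greppable wrappers (usage sketch; see the hunks
+below for the definitions):
+
+    netdev_lock(dev);           /* mutex_lock(&dev->lock) */
+    ...
+    netdev_assert_locked(dev);  /* lockdep_assert_held(&dev->lock) */
+    ...
+    netdev_unlock(dev);         /* mutex_unlock(&dev->lock) */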
+ +Reviewed-by: Joe Damato +Reviewed-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Reviewed-by: Przemek Kitszel +Link: https://patch.msgid.link/20250115035319.559603-2-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/iavf/iavf_main.c | 74 ++++++++++----------- + drivers/net/netdevsim/ethtool.c | 4 +- + include/linux/netdevice.h | 23 ++++++- + net/shaper/shaper.c | 6 +- + 4 files changed, 63 insertions(+), 44 deletions(-) + +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index 7c427003184d5..72314b0a1b25b 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -1992,7 +1992,7 @@ static void iavf_finish_config(struct work_struct *work) + * The dev->lock is needed to update the queue number + */ + rtnl_lock(); +- mutex_lock(&adapter->netdev->lock); ++ netdev_lock(adapter->netdev); + mutex_lock(&adapter->crit_lock); + + if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES) && +@@ -2012,7 +2012,7 @@ static void iavf_finish_config(struct work_struct *work) + netif_set_real_num_tx_queues(adapter->netdev, pairs); + + if (adapter->netdev->reg_state != NETREG_REGISTERED) { +- mutex_unlock(&adapter->netdev->lock); ++ netdev_unlock(adapter->netdev); + netdev_released = true; + err = register_netdevice(adapter->netdev); + if (err) { +@@ -2042,7 +2042,7 @@ static void iavf_finish_config(struct work_struct *work) + out: + mutex_unlock(&adapter->crit_lock); + if (!netdev_released) +- mutex_unlock(&adapter->netdev->lock); ++ netdev_unlock(adapter->netdev); + rtnl_unlock(); + } + +@@ -2739,10 +2739,10 @@ static void iavf_watchdog_task(struct work_struct *work) + struct iavf_hw *hw = &adapter->hw; + u32 reg_val; + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state == __IAVF_REMOVE) { +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; + } + +@@ -2756,35 +2756,35 @@ static void iavf_watchdog_task(struct work_struct *work) + case __IAVF_STARTUP: + iavf_startup(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(30)); + return; + case __IAVF_INIT_VERSION_CHECK: + iavf_init_version_check(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(30)); + return; + case __IAVF_INIT_GET_RESOURCES: + iavf_init_get_resources(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; + case __IAVF_INIT_EXTENDED_CAPS: + iavf_init_process_extended_caps(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; + case __IAVF_INIT_CONFIG_ADAPTER: + iavf_init_config_adapter(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + msecs_to_jiffies(1)); + return; +@@ -2796,7 +2796,7 @@ static void iavf_watchdog_task(struct work_struct *work) + 
* as it can loop forever + */ + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; + } + if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { +@@ -2805,7 +2805,7 @@ static void iavf_watchdog_task(struct work_struct *work) + adapter->flags |= IAVF_FLAG_PF_COMMS_FAILED; + iavf_shutdown_adminq(hw); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, (5 * HZ)); + return; +@@ -2813,7 +2813,7 @@ static void iavf_watchdog_task(struct work_struct *work) + /* Try again from failed step*/ + iavf_change_state(adapter, adapter->last_state); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ); + return; + case __IAVF_COMM_FAILED: +@@ -2826,7 +2826,7 @@ static void iavf_watchdog_task(struct work_struct *work) + iavf_change_state(adapter, __IAVF_INIT_FAILED); + adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED; + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; + } + reg_val = rd32(hw, IAVF_VFGEN_RSTAT) & +@@ -2846,14 +2846,14 @@ static void iavf_watchdog_task(struct work_struct *work) + adapter->aq_required = 0; + adapter->current_op = VIRTCHNL_OP_UNKNOWN; + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, + msecs_to_jiffies(10)); + return; + case __IAVF_RESETTING: + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, &adapter->watchdog_task, + HZ * 2); + return; +@@ -2884,7 +2884,7 @@ static void iavf_watchdog_task(struct work_struct *work) + case __IAVF_REMOVE: + default: + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; + } + +@@ -2896,14 +2896,14 @@ static void iavf_watchdog_task(struct work_struct *work) + dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + queue_delayed_work(adapter->wq, + &adapter->watchdog_task, HZ * 2); + return; + } + + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + restart_watchdog: + if (adapter->state >= __IAVF_DOWN) + queue_work(adapter->wq, &adapter->adminq_task); +@@ -3030,12 +3030,12 @@ static void iavf_reset_task(struct work_struct *work) + /* When device is being removed it doesn't make sense to run the reset + * task, just return in such a case. + */ +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state != __IAVF_REMOVE) + queue_work(adapter->wq, &adapter->reset_task); + +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; + } + +@@ -3083,7 +3083,7 @@ static void iavf_reset_task(struct work_struct *work) + reg_val); + iavf_disable_vf(adapter); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return; /* Do not attempt to reinit. It's dead, Jim. 
*/ + } + +@@ -3224,7 +3224,7 @@ static void iavf_reset_task(struct work_struct *work) + + wake_up(&adapter->reset_waitqueue); + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + return; + reset_err: +@@ -3235,7 +3235,7 @@ static void iavf_reset_task(struct work_struct *work) + iavf_disable_vf(adapter); + + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); + } + +@@ -3707,10 +3707,10 @@ static int __iavf_setup_tc(struct net_device *netdev, void *type_data) + if (test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) + return 0; + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + netif_set_real_num_rx_queues(netdev, total_qps); + netif_set_real_num_tx_queues(netdev, total_qps); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + return ret; + } +@@ -4380,7 +4380,7 @@ static int iavf_open(struct net_device *netdev) + return -EIO; + } + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + while (!mutex_trylock(&adapter->crit_lock)) { + /* If we are in __IAVF_INIT_CONFIG_ADAPTER state the crit_lock + * is already taken and iavf_open is called from an upper +@@ -4388,7 +4388,7 @@ static int iavf_open(struct net_device *netdev) + * We have to leave here to avoid dead lock. + */ + if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) { +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return -EBUSY; + } + +@@ -4439,7 +4439,7 @@ static int iavf_open(struct net_device *netdev) + iavf_irq_enable(adapter, true); + + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + return 0; + +@@ -4452,7 +4452,7 @@ static int iavf_open(struct net_device *netdev) + iavf_free_all_tx_resources(adapter); + err_unlock: + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + return err; + } +@@ -4474,12 +4474,12 @@ static int iavf_close(struct net_device *netdev) + u64 aq_to_restore; + int status; + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + mutex_lock(&adapter->crit_lock); + + if (adapter->state <= __IAVF_DOWN_PENDING) { + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + return 0; + } + +@@ -4513,7 +4513,7 @@ static int iavf_close(struct net_device *netdev) + iavf_free_traffic_irqs(adapter); + + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + /* We explicitly don't free resources here because the hardware is + * still active and can DMA into memory. 
Resources are cleared in +@@ -5390,7 +5390,7 @@ static int iavf_suspend(struct device *dev_d) + + netif_device_detach(netdev); + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + mutex_lock(&adapter->crit_lock); + + if (netif_running(netdev)) { +@@ -5402,7 +5402,7 @@ static int iavf_suspend(struct device *dev_d) + iavf_reset_interrupt_capability(adapter); + + mutex_unlock(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + return 0; + } +@@ -5501,7 +5501,7 @@ static void iavf_remove(struct pci_dev *pdev) + if (netdev->reg_state == NETREG_REGISTERED) + unregister_netdev(netdev); + +- mutex_lock(&netdev->lock); ++ netdev_lock(netdev); + mutex_lock(&adapter->crit_lock); + dev_info(&adapter->pdev->dev, "Removing device\n"); + iavf_change_state(adapter, __IAVF_REMOVE); +@@ -5538,7 +5538,7 @@ static void iavf_remove(struct pci_dev *pdev) + mutex_destroy(&hw->aq.asq_mutex); + mutex_unlock(&adapter->crit_lock); + mutex_destroy(&adapter->crit_lock); +- mutex_unlock(&netdev->lock); ++ netdev_unlock(netdev); + + iounmap(hw->hw_addr); + pci_release_regions(pdev); +diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c +index 5fe1eaef99b5b..3f44a11aec83e 100644 +--- a/drivers/net/netdevsim/ethtool.c ++++ b/drivers/net/netdevsim/ethtool.c +@@ -103,10 +103,10 @@ nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) + struct netdevsim *ns = netdev_priv(dev); + int err; + +- mutex_lock(&dev->lock); ++ netdev_lock(dev); + err = netif_set_real_num_queues(dev, ch->combined_count, + ch->combined_count); +- mutex_unlock(&dev->lock); ++ netdev_unlock(dev); + if (err) + return err; + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 8268be0723eee..035cc881dd756 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2442,8 +2442,12 @@ struct net_device { + u32 napi_defer_hard_irqs; + + /** +- * @lock: protects @net_shaper_hierarchy, feel free to use for other +- * netdev-scope protection. Ordering: take after rtnl_lock. ++ * @lock: netdev-scope lock, protects a small selection of fields. ++ * Should always be taken using netdev_lock() / netdev_unlock() helpers. ++ * Drivers are free to use it for other protection. ++ * ++ * Protects: @net_shaper_hierarchy. ++ * Ordering: take after rtnl_lock. 
+ */ + struct mutex lock; + +@@ -2673,6 +2677,21 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, + struct napi_struct *napi); + ++static inline void netdev_lock(struct net_device *dev) ++{ ++ mutex_lock(&dev->lock); ++} ++ ++static inline void netdev_unlock(struct net_device *dev) ++{ ++ mutex_unlock(&dev->lock); ++} ++ ++static inline void netdev_assert_locked(struct net_device *dev) ++{ ++ lockdep_assert_held(&dev->lock); ++} ++ + static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) + { + napi->irq = irq; +diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c +index 15463062fe7b6..7101a48bce545 100644 +--- a/net/shaper/shaper.c ++++ b/net/shaper/shaper.c +@@ -40,7 +40,7 @@ static void net_shaper_lock(struct net_shaper_binding *binding) + { + switch (binding->type) { + case NET_SHAPER_BINDING_TYPE_NETDEV: +- mutex_lock(&binding->netdev->lock); ++ netdev_lock(binding->netdev); + break; + } + } +@@ -49,7 +49,7 @@ static void net_shaper_unlock(struct net_shaper_binding *binding) + { + switch (binding->type) { + case NET_SHAPER_BINDING_TYPE_NETDEV: +- mutex_unlock(&binding->netdev->lock); ++ netdev_unlock(binding->netdev); + break; + } + } +@@ -1398,7 +1398,7 @@ void net_shaper_set_real_num_tx_queues(struct net_device *dev, + /* Only drivers implementing shapers support ensure + * the lock is acquired in advance. + */ +- lockdep_assert_held(&dev->lock); ++ netdev_assert_locked(dev); + + /* Take action only when decreasing the tx queue number. */ + for (i = txq; i < dev->real_num_tx_queues; ++i) { +-- +2.39.5 + diff --git a/queue-6.13/net-ipv6-fix-dst-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch b/queue-6.13/net-ipv6-fix-dst-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch new file mode 100644 index 0000000000..1c9c6e6aa3 --- /dev/null +++ b/queue-6.13/net-ipv6-fix-dst-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch @@ -0,0 +1,94 @@ +From d48c8db4fede6d25a2f315667fd979dd90efdca2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 29 Jan 2025 19:15:19 -0800 +Subject: net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels + +From: Jakub Kicinski + +[ Upstream commit 92191dd1073088753821b862b791dcc83e558e07 ] + +Some lwtunnels have a dst cache for post-transformation dst. +If the packet destination did not change we may end up recording +a reference to the lwtunnel in its own cache, and the lwtunnel +state will never be freed. + +Discovered by the ioam6.sh test, kmemleak was recently fixed +to catch per-cpu memory leaks. I'm not sure if rpl and seg6 +can actually hit this, but in principle I don't see why not. 
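+
+The guard is the same in all three tunnels (sketch, condensed from the
+hunks below): a post-transformation dst that still points at our own
+lwtunnel state must not be cached, since the cache reference would pin
+the state forever:
+
+    /* cache only if we don't create a dst reference loop */
+    if (orig_dst->lwtstate != dst->lwtstate)
+        dst_cache_set_ip6(&cache, dst, &fl6.saddr);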
+ +Fixes: 8cb3bf8bff3c ("ipv6: ioam: Add support for the ip6ip6 encapsulation") +Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") +Fixes: a7a29f9c361f ("net: ipv6: add rpl sr tunnel") +Reviewed-by: Simon Horman +Link: https://patch.msgid.link/20250130031519.2716843-2-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv6/ioam6_iptunnel.c | 9 ++++++--- + net/ipv6/rpl_iptunnel.c | 9 ++++++--- + net/ipv6/seg6_iptunnel.c | 9 ++++++--- + 3 files changed, 18 insertions(+), 9 deletions(-) + +diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c +index 28e5a89dc2557..7b6bd832930d1 100644 +--- a/net/ipv6/ioam6_iptunnel.c ++++ b/net/ipv6/ioam6_iptunnel.c +@@ -411,9 +411,12 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + goto drop; + } + +- local_bh_disable(); +- dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); +- local_bh_enable(); ++ /* cache only if we don't create a dst reference loop */ ++ if (dst->lwtstate != cache_dst->lwtstate) { ++ local_bh_disable(); ++ dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); ++ local_bh_enable(); ++ } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + if (unlikely(err)) +diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c +index 7ba22d2f2bfef..be084089ec783 100644 +--- a/net/ipv6/rpl_iptunnel.c ++++ b/net/ipv6/rpl_iptunnel.c +@@ -236,9 +236,12 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) + goto drop; + } + +- local_bh_disable(); +- dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); +- local_bh_enable(); ++ /* cache only if we don't create a dst reference loop */ ++ if (orig_dst->lwtstate != dst->lwtstate) { ++ local_bh_disable(); ++ dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); ++ local_bh_enable(); ++ } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) +diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c +index 4bf937bfc2633..316dbc2694f2a 100644 +--- a/net/ipv6/seg6_iptunnel.c ++++ b/net/ipv6/seg6_iptunnel.c +@@ -575,9 +575,12 @@ static int seg6_output_core(struct net *net, struct sock *sk, + goto drop; + } + +- local_bh_disable(); +- dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); +- local_bh_enable(); ++ /* cache only if we don't create a dst reference loop */ ++ if (orig_dst->lwtstate != dst->lwtstate) { ++ local_bh_disable(); ++ dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); ++ local_bh_enable(); ++ } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) +-- +2.39.5 + diff --git a/queue-6.13/net-ipv6-ioam6_iptunnel-mitigate-2-realloc-issue.patch b/queue-6.13/net-ipv6-ioam6_iptunnel-mitigate-2-realloc-issue.patch new file mode 100644 index 0000000000..d61b082324 --- /dev/null +++ b/queue-6.13/net-ipv6-ioam6_iptunnel-mitigate-2-realloc-issue.patch @@ -0,0 +1,190 @@ +From 01a53d28912cfe3fe7f503b1aca471b378bb8f3f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 Dec 2024 13:49:43 +0100 +Subject: net: ipv6: ioam6_iptunnel: mitigate 2-realloc issue + +From: Justin Iurman + +[ Upstream commit dce525185bc92864e5a318040285ee070563fe34 ] + +This patch mitigates the two-reallocations issue with ioam6_iptunnel by +providing the dst_entry (in the cache) to the first call to +skb_cow_head(). As a result, the very first iteration may still trigger +two reallocations (i.e., empty cache), while next iterations would only +trigger a single reallocation. 
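+
+The key idea, condensed (sketch only; dst_dev_overhead() is assumed,
+from its use below, to return LL_RESERVED_SPACE(dst->dev) for a
+non-NULL dst and fall back to skb->mac_len otherwise):
+
+    cache_dst = dst_cache_get(&ilwt->cache); /* NULL for the first packet */
+    err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));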
+ +Performance tests before/after applying this patch, which clearly shows +the improvement: +- inline mode: + - before: https://ibb.co/LhQ8V63 + - after: https://ibb.co/x5YT2bS +- encap mode: + - before: https://ibb.co/3Cjm5m0 + - after: https://ibb.co/TwpsxTC +- encap mode with tunsrc: + - before: https://ibb.co/Gpy9QPg + - after: https://ibb.co/PW1bZFT + +This patch also fixes an incorrect behavior: after the insertion, the +second call to skb_cow_head() makes sure that the dev has enough +headroom in the skb for layer 2 and stuff. In that case, the "old" +dst_entry was used, which is now fixed. After discussing with Paolo, it +appears that both patches can be merged into a single one -this one- +(for the sake of readability) and target net-next. + +Signed-off-by: Justin Iurman +Signed-off-by: Paolo Abeni +Stable-dep-of: 92191dd10730 ("net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels") +Signed-off-by: Sasha Levin +--- + net/ipv6/ioam6_iptunnel.c | 73 ++++++++++++++++++++------------------- + 1 file changed, 37 insertions(+), 36 deletions(-) + +diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c +index 9d8422e350f8d..28e5a89dc2557 100644 +--- a/net/ipv6/ioam6_iptunnel.c ++++ b/net/ipv6/ioam6_iptunnel.c +@@ -253,14 +253,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) + } + + static int ioam6_do_inline(struct net *net, struct sk_buff *skb, +- struct ioam6_lwt_encap *tuninfo) ++ struct ioam6_lwt_encap *tuninfo, ++ struct dst_entry *cache_dst) + { + struct ipv6hdr *oldhdr, *hdr; + int hdrlen, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + +- err = skb_cow_head(skb, hdrlen + skb->mac_len); ++ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) + return err; + +@@ -291,7 +292,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, + struct ioam6_lwt_encap *tuninfo, + bool has_tunsrc, + struct in6_addr *tunsrc, +- struct in6_addr *tundst) ++ struct in6_addr *tundst, ++ struct dst_entry *cache_dst) + { + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr, *inner_hdr; +@@ -300,7 +302,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + len = sizeof(*hdr) + hdrlen; + +- err = skb_cow_head(skb, len + skb->mac_len); ++ err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) + return err; + +@@ -334,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, + + static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + { +- struct dst_entry *dst = skb_dst(skb); ++ struct dst_entry *dst = skb_dst(skb), *cache_dst; + struct in6_addr orig_daddr; + struct ioam6_lwt *ilwt; + int err = -EINVAL; +@@ -352,6 +354,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + + orig_daddr = ipv6_hdr(skb)->daddr; + ++ local_bh_disable(); ++ cache_dst = dst_cache_get(&ilwt->cache); ++ local_bh_enable(); ++ + switch (ilwt->mode) { + case IOAM6_IPTUNNEL_MODE_INLINE: + do_inline: +@@ -359,7 +365,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + +- err = ioam6_do_inline(net, skb, &ilwt->tuninfo); ++ err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); + if (unlikely(err)) + goto drop; + +@@ -369,7 +375,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + /* Encapsulation (ip6ip6) */ + err = ioam6_do_encap(net, skb, 
&ilwt->tuninfo, + ilwt->has_tunsrc, &ilwt->tunsrc, +- &ilwt->tundst); ++ &ilwt->tundst, cache_dst); + if (unlikely(err)) + goto drop; + +@@ -387,41 +393,36 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) + goto drop; + } + +- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +- if (unlikely(err)) +- goto drop; ++ if (unlikely(!cache_dst)) { ++ struct ipv6hdr *hdr = ipv6_hdr(skb); ++ struct flowi6 fl6; ++ ++ memset(&fl6, 0, sizeof(fl6)); ++ fl6.daddr = hdr->daddr; ++ fl6.saddr = hdr->saddr; ++ fl6.flowlabel = ip6_flowinfo(hdr); ++ fl6.flowi6_mark = skb->mark; ++ fl6.flowi6_proto = hdr->nexthdr; ++ ++ cache_dst = ip6_route_output(net, NULL, &fl6); ++ if (cache_dst->error) { ++ err = cache_dst->error; ++ dst_release(cache_dst); ++ goto drop; ++ } + +- if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + local_bh_disable(); +- dst = dst_cache_get(&ilwt->cache); ++ dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); + local_bh_enable(); + +- if (unlikely(!dst)) { +- struct ipv6hdr *hdr = ipv6_hdr(skb); +- struct flowi6 fl6; +- +- memset(&fl6, 0, sizeof(fl6)); +- fl6.daddr = hdr->daddr; +- fl6.saddr = hdr->saddr; +- fl6.flowlabel = ip6_flowinfo(hdr); +- fl6.flowi6_mark = skb->mark; +- fl6.flowi6_proto = hdr->nexthdr; +- +- dst = ip6_route_output(net, NULL, &fl6); +- if (dst->error) { +- err = dst->error; +- dst_release(dst); +- goto drop; +- } +- +- local_bh_disable(); +- dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); +- local_bh_enable(); +- } ++ err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); ++ if (unlikely(err)) ++ goto drop; ++ } + ++ if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + skb_dst_drop(skb); +- skb_dst_set(skb, dst); +- ++ skb_dst_set(skb, cache_dst); + return dst_output(net, sk, skb); + } + out: +-- +2.39.5 + diff --git a/queue-6.13/net-ipv6-rpl_iptunnel-mitigate-2-realloc-issue.patch b/queue-6.13/net-ipv6-rpl_iptunnel-mitigate-2-realloc-issue.patch new file mode 100644 index 0000000000..fa82c3580b --- /dev/null +++ b/queue-6.13/net-ipv6-rpl_iptunnel-mitigate-2-realloc-issue.patch @@ -0,0 +1,154 @@ +From 179bd2633bd3785bd22595206661558cd9a951cb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 Dec 2024 13:49:45 +0100 +Subject: net: ipv6: rpl_iptunnel: mitigate 2-realloc issue + +From: Justin Iurman + +[ Upstream commit 985ec6f5e6235242191370628acb73d7a9f0c0ea ] + +This patch mitigates the two-reallocations issue with rpl_iptunnel by +providing the dst_entry (in the cache) to the first call to +skb_cow_head(). As a result, the very first iteration would still +trigger two reallocations (i.e., empty cache), while next iterations +would only trigger a single reallocation. 
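+
+Condensed (sketch only), the reordering is: fetch the cached dst
+first, let the SRH insertion size its skb_cow_head() with it, and do
+the route lookup plus a second cow only on the cold path:
+
+    dst = dst_cache_get(&rlwt->cache);
+    err = rpl_do_srh(skb, rlwt, dst);  /* cow uses dst when cache is warm */
+    if (!dst) {
+        /* cold path: route lookup, fill the cache, cow for the new dev */
+    }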
+ +Performance tests before/after applying this patch, which clearly shows +there is no impact (it even shows improvement): +- before: https://ibb.co/nQJhqwc +- after: https://ibb.co/4ZvW6wV + +Signed-off-by: Justin Iurman +Cc: Alexander Aring +Signed-off-by: Paolo Abeni +Stable-dep-of: 92191dd10730 ("net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels") +Signed-off-by: Sasha Levin +--- + net/ipv6/rpl_iptunnel.c | 46 ++++++++++++++++++++++------------------- + 1 file changed, 25 insertions(+), 21 deletions(-) + +diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c +index db3c19a42e1ca..7ba22d2f2bfef 100644 +--- a/net/ipv6/rpl_iptunnel.c ++++ b/net/ipv6/rpl_iptunnel.c +@@ -125,7 +125,8 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) + } + + static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, +- const struct ipv6_rpl_sr_hdr *srh) ++ const struct ipv6_rpl_sr_hdr *srh, ++ struct dst_entry *cache_dst) + { + struct ipv6_rpl_sr_hdr *isrh, *csrh; + const struct ipv6hdr *oldhdr; +@@ -153,7 +154,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, + + hdrlen = ((csrh->hdrlen + 1) << 3); + +- err = skb_cow_head(skb, hdrlen + skb->mac_len); ++ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) { + kfree(buf); + return err; +@@ -186,7 +187,8 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, + return 0; + } + +-static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) ++static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, ++ struct dst_entry *cache_dst) + { + struct dst_entry *dst = skb_dst(skb); + struct rpl_iptunnel_encap *tinfo; +@@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) + + tinfo = rpl_encap_lwtunnel(dst->lwtstate); + +- return rpl_do_srh_inline(skb, rlwt, tinfo->srh); ++ return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); + } + + static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) +@@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) + + rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + +- err = rpl_do_srh(skb, rlwt); +- if (unlikely(err)) +- goto drop; +- + local_bh_disable(); + dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + ++ err = rpl_do_srh(skb, rlwt, dst); ++ if (unlikely(err)) ++ goto drop; ++ + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; +@@ -237,15 +239,15 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); ++ ++ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); ++ if (unlikely(err)) ++ goto drop; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + +- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +- if (unlikely(err)) +- goto drop; +- + return dst_output(net, sk, skb); + + drop: +@@ -262,29 +264,31 @@ static int rpl_input(struct sk_buff *skb) + + rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + +- err = rpl_do_srh(skb, rlwt); +- if (unlikely(err)) +- goto drop; +- + local_bh_disable(); + dst = dst_cache_get(&rlwt->cache); ++ local_bh_enable(); ++ ++ err = rpl_do_srh(skb, rlwt, dst); ++ if (unlikely(err)) ++ goto drop; + + if (!dst) { + ip6_route_input(skb); + dst = skb_dst(skb); + if (!dst->error) { ++ local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, + &ipv6_hdr(skb)->saddr); ++ 
local_bh_enable(); + } ++ ++ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); ++ if (unlikely(err)) ++ goto drop; + } else { + skb_dst_drop(skb); + skb_dst_set(skb, dst); + } +- local_bh_enable(); +- +- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +- if (unlikely(err)) +- goto drop; + + return dst_input(skb); + +-- +2.39.5 + diff --git a/queue-6.13/net-ipv6-seg6_iptunnel-mitigate-2-realloc-issue.patch b/queue-6.13/net-ipv6-seg6_iptunnel-mitigate-2-realloc-issue.patch new file mode 100644 index 0000000000..7caeeb728a --- /dev/null +++ b/queue-6.13/net-ipv6-seg6_iptunnel-mitigate-2-realloc-issue.patch @@ -0,0 +1,254 @@ +From 6661ff9593387562aedbbfc07f1eb953186bc99a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 Dec 2024 13:49:44 +0100 +Subject: net: ipv6: seg6_iptunnel: mitigate 2-realloc issue + +From: Justin Iurman + +[ Upstream commit 40475b63761abb6f8fdef960d03228a08662c9c4 ] + +This patch mitigates the two-reallocations issue with seg6_iptunnel by +providing the dst_entry (in the cache) to the first call to +skb_cow_head(). As a result, the very first iteration would still +trigger two reallocations (i.e., empty cache), while next iterations +would only trigger a single reallocation. + +Performance tests before/after applying this patch, which clearly shows +the improvement: +- before: https://ibb.co/3Cg4sNH +- after: https://ibb.co/8rQ350r + +Signed-off-by: Justin Iurman +Cc: David Lebrun +Signed-off-by: Paolo Abeni +Stable-dep-of: 92191dd10730 ("net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels") +Signed-off-by: Sasha Levin +--- + net/ipv6/seg6_iptunnel.c | 85 ++++++++++++++++++++++++---------------- + 1 file changed, 52 insertions(+), 33 deletions(-) + +diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c +index 098632adc9b5a..4bf937bfc2633 100644 +--- a/net/ipv6/seg6_iptunnel.c ++++ b/net/ipv6/seg6_iptunnel.c +@@ -124,8 +124,8 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, + return flowlabel; + } + +-/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +-int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) ++static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, ++ int proto, struct dst_entry *cache_dst) + { + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); +@@ -137,7 +137,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) + hdrlen = (osrh->hdrlen + 1) << 3; + tot_len = hdrlen + sizeof(*hdr); + +- err = skb_cow_head(skb, tot_len + skb->mac_len); ++ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) + return err; + +@@ -197,11 +197,18 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) + + return 0; + } ++ ++/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ ++int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) ++{ ++ return __seg6_do_srh_encap(skb, osrh, proto, NULL); ++} + EXPORT_SYMBOL_GPL(seg6_do_srh_encap); + + /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ + static int seg6_do_srh_encap_red(struct sk_buff *skb, +- struct ipv6_sr_hdr *osrh, int proto) ++ struct ipv6_sr_hdr *osrh, int proto, ++ struct dst_entry *cache_dst) + { + __u8 first_seg = osrh->first_segment; + struct dst_entry *dst = skb_dst(skb); +@@ -230,7 +237,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, + + tot_len = red_hdrlen + 
sizeof(struct ipv6hdr); + +- err = skb_cow_head(skb, tot_len + skb->mac_len); ++ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) + return err; + +@@ -317,8 +324,8 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, + return 0; + } + +-/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +-int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) ++static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, ++ struct dst_entry *cache_dst) + { + struct ipv6hdr *hdr, *oldhdr; + struct ipv6_sr_hdr *isrh; +@@ -326,7 +333,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) + + hdrlen = (osrh->hdrlen + 1) << 3; + +- err = skb_cow_head(skb, hdrlen + skb->mac_len); ++ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); + if (unlikely(err)) + return err; + +@@ -369,9 +376,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) + + return 0; + } +-EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + +-static int seg6_do_srh(struct sk_buff *skb) ++static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) + { + struct dst_entry *dst = skb_dst(skb); + struct seg6_iptunnel_encap *tinfo; +@@ -384,7 +390,7 @@ static int seg6_do_srh(struct sk_buff *skb) + if (skb->protocol != htons(ETH_P_IPV6)) + return -EINVAL; + +- err = seg6_do_srh_inline(skb, tinfo->srh); ++ err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); + if (err) + return err; + break; +@@ -402,9 +408,11 @@ static int seg6_do_srh(struct sk_buff *skb) + return -EINVAL; + + if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) +- err = seg6_do_srh_encap(skb, tinfo->srh, proto); ++ err = __seg6_do_srh_encap(skb, tinfo->srh, ++ proto, cache_dst); + else +- err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); ++ err = seg6_do_srh_encap_red(skb, tinfo->srh, ++ proto, cache_dst); + + if (err) + return err; +@@ -425,11 +433,13 @@ static int seg6_do_srh(struct sk_buff *skb) + skb_push(skb, skb->mac_len); + + if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) +- err = seg6_do_srh_encap(skb, tinfo->srh, +- IPPROTO_ETHERNET); ++ err = __seg6_do_srh_encap(skb, tinfo->srh, ++ IPPROTO_ETHERNET, ++ cache_dst); + else + err = seg6_do_srh_encap_red(skb, tinfo->srh, +- IPPROTO_ETHERNET); ++ IPPROTO_ETHERNET, ++ cache_dst); + + if (err) + return err; +@@ -444,6 +454,13 @@ static int seg6_do_srh(struct sk_buff *skb) + return 0; + } + ++/* insert an SRH within an IPv6 packet, just after the IPv6 header */ ++int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) ++{ ++ return __seg6_do_srh_inline(skb, osrh, NULL); ++} ++EXPORT_SYMBOL_GPL(seg6_do_srh_inline); ++ + static int seg6_input_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) + { +@@ -458,31 +475,33 @@ static int seg6_input_core(struct net *net, struct sock *sk, + struct seg6_lwt *slwt; + int err; + +- err = seg6_do_srh(skb); +- if (unlikely(err)) +- goto drop; +- + slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + + local_bh_disable(); + dst = dst_cache_get(&slwt->cache); ++ local_bh_enable(); ++ ++ err = seg6_do_srh(skb, dst); ++ if (unlikely(err)) ++ goto drop; + + if (!dst) { + ip6_route_input(skb); + dst = skb_dst(skb); + if (!dst->error) { ++ local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, + &ipv6_hdr(skb)->saddr); ++ local_bh_enable(); + } ++ ++ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); ++ if (unlikely(err)) ++ goto drop; + } else { + skb_dst_drop(skb); + skb_dst_set(skb, dst); + } +- local_bh_enable(); +- +- err = 
skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +- if (unlikely(err)) +- goto drop; + + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, +@@ -528,16 +547,16 @@ static int seg6_output_core(struct net *net, struct sock *sk, + struct seg6_lwt *slwt; + int err; + +- err = seg6_do_srh(skb); +- if (unlikely(err)) +- goto drop; +- + slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + + local_bh_disable(); + dst = dst_cache_get(&slwt->cache); + local_bh_enable(); + ++ err = seg6_do_srh(skb, dst); ++ if (unlikely(err)) ++ goto drop; ++ + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; +@@ -559,15 +578,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); ++ ++ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); ++ if (unlikely(err)) ++ goto drop; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + +- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +- if (unlikely(err)) +- goto drop; +- + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, + NULL, skb_dst(skb)->dev, dst_output); +-- +2.39.5 + diff --git a/queue-6.13/net-make-netdev_lock-protect-netdev-reg_state.patch b/queue-6.13/net-make-netdev_lock-protect-netdev-reg_state.patch new file mode 100644 index 0000000000..165b49a8c4 --- /dev/null +++ b/queue-6.13/net-make-netdev_lock-protect-netdev-reg_state.patch @@ -0,0 +1,84 @@ +From 621cdef591d78eef5fdd1bff248971b0b2149796 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jan 2025 19:53:10 -0800 +Subject: net: make netdev_lock() protect netdev->reg_state + +From: Jakub Kicinski + +[ Upstream commit 5fda3f35349b6b7f22f5f5095a3821261d515075 ] + +Protect writes to netdev->reg_state with netdev_lock(). +From now on holding netdev_lock() is sufficient to prevent +the net_device from getting unregistered, so code which +wants to hold just a single netdev around no longer needs +to hold rtnl_lock. + +We do not protect the NETREG_UNREGISTERED -> NETREG_RELEASED +transition. We'd need to move mutex_destroy(netdev->lock) +to .release, but the real reason is that trying to stop +the unregistration process mid-way would be unsafe / crazy. +Taking references on such devices is not safe, either. +So the intended semantics are to lock REGISTERED devices. + +Reviewed-by: Joe Damato +Reviewed-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250115035319.559603-3-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 2 +- + net/core/dev.c | 6 ++++++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 035cc881dd756..47f817bcea503 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2446,7 +2446,7 @@ struct net_device { + * Should always be taken using netdev_lock() / netdev_unlock() helpers. + * Drivers are free to use it for other protection. + * +- * Protects: @net_shaper_hierarchy. ++ * Protects: @reg_state, @net_shaper_hierarchy. + * Ordering: take after rtnl_lock. 
+ */ + struct mutex lock; +diff --git a/net/core/dev.c b/net/core/dev.c +index 09a9adfa7da99..75996e1aac46c 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -10656,7 +10656,9 @@ int register_netdevice(struct net_device *dev) + + ret = netdev_register_kobject(dev); + ++ netdev_lock(dev); + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); ++ netdev_unlock(dev); + + if (ret) + goto err_uninit_notify; +@@ -10954,7 +10956,9 @@ void netdev_run_todo(void) + continue; + } + ++ netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); ++ netdev_unlock(dev); + linkwatch_sync_dev(dev); + } + +@@ -11560,7 +11564,9 @@ void unregister_netdevice_many_notify(struct list_head *head, + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); ++ netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); ++ netdev_unlock(dev); + } + flush_all_backlogs(); + +-- +2.39.5 + diff --git a/queue-6.13/net-make-sure-we-retain-napi-ordering-on-netdev-napi.patch b/queue-6.13/net-make-sure-we-retain-napi-ordering-on-netdev-napi.patch new file mode 100644 index 0000000000..a06f932d79 --- /dev/null +++ b/queue-6.13/net-make-sure-we-retain-napi-ordering-on-netdev-napi.patch @@ -0,0 +1,123 @@ +From 0743664434d4be423767fbe92f9426a526c9ecd8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 7 Jan 2025 08:08:39 -0800 +Subject: net: make sure we retain NAPI ordering on netdev->napi_list + +From: Jakub Kicinski + +[ Upstream commit d6c7b03497eef8b66bf0b5572881359913e39787 ] + +Netlink code depends on NAPI instances being sorted by ID on +the netdev list for dump continuation. We need to be able to +find the position on the list where we left off if dump does +not fit in a single skb, and in the meantime NAPI instances +can come and go. + +This was trivially true when we were assigning a new ID to every +new NAPI instance. Since we added the NAPI config API, we try +to retain the ID previously used for the same queue, but still +add the new NAPI instance at the start of the list. + +This is fine if we reset the entire netdev and all NAPIs get +removed and added back. If driver replaces a NAPI instance +during an operation like DEVMEM queue reset, or recreates +a subset of NAPI instances in other ways we may end up with +broken ordering, and therefore Netlink dumps with either +missing or duplicated entries. + +At this stage the problem is theoretical. Only two drivers +support queue API, bnxt and gve. gve recreates NAPIs during +queue reset, but it doesn't support NAPI config. +bnxt supports NAPI config but doesn't recreate instances +during reset. + +We need to save the ID in the config as soon as it is assigned +because otherwise the new NAPI will not know what ID it will +get at enable time, at the time it is being added. 
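+
+To illustrate the invariant (IDs made up for the example): the list is
+kept sorted in descending ID order, e.g. 134 -> 133 -> 132 -> 131, so
+a partial dump can resume after the last ID it reported. If a NAPI
+inheriting ID 133 from its config were simply re-added at the head,
+the list would read 133 -> 134 -> 132 -> 131 and a resumed dump could
+skip or duplicate entries.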
+ +Reviewed-by: Willem de Bruijn +Reviewed-by: Eric Dumazet +Signed-off-by: Jakub Kicinski +Signed-off-by: Paolo Abeni +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + net/core/dev.c | 42 ++++++++++++++++++++++++++++++++++++------ + 1 file changed, 36 insertions(+), 6 deletions(-) + +diff --git a/net/core/dev.c b/net/core/dev.c +index fbb796375aa0e..09a9adfa7da99 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -6735,13 +6735,14 @@ static void napi_restore_config(struct napi_struct *n) + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; + /* a NAPI ID might be stored in the config, if so use it. if not, use +- * napi_hash_add to generate one for us. It will be saved to the config +- * in napi_disable. ++ * napi_hash_add to generate one for us. + */ +- if (n->config->napi_id) ++ if (n->config->napi_id) { + napi_hash_add_with_id(n, n->config->napi_id); +- else ++ } else { + napi_hash_add(n); ++ n->config->napi_id = n->napi_id; ++ } + } + + static void napi_save_config(struct napi_struct *n) +@@ -6749,10 +6750,39 @@ static void napi_save_config(struct napi_struct *n) + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; +- n->config->napi_id = n->napi_id; + napi_hash_del(n); + } + ++/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will ++ * inherit an existing ID try to insert it at the right position. ++ */ ++static void ++netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) ++{ ++ unsigned int new_id, pos_id; ++ struct list_head *higher; ++ struct napi_struct *pos; ++ ++ new_id = UINT_MAX; ++ if (napi->config && napi->config->napi_id) ++ new_id = napi->config->napi_id; ++ ++ higher = &dev->napi_list; ++ list_for_each_entry(pos, &dev->napi_list, dev_list) { ++ if (pos->napi_id >= MIN_NAPI_ID) ++ pos_id = pos->napi_id; ++ else if (pos->config) ++ pos_id = pos->config->napi_id; ++ else ++ pos_id = UINT_MAX; ++ ++ if (pos_id <= new_id) ++ break; ++ higher = &pos->dev_list; ++ } ++ list_add_rcu(&napi->dev_list, higher); /* adds after higher */ ++} ++ + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) + { +@@ -6779,7 +6809,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + napi->list_owner = -1; + set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); +- list_add_rcu(&napi->dev_list, &dev->napi_list); ++ netif_napi_dev_list_add(dev, napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable +-- +2.39.5 + diff --git a/queue-6.13/net-protect-netdev-napi_list-with-netdev_lock.patch b/queue-6.13/net-protect-netdev-napi_list-with-netdev_lock.patch new file mode 100644 index 0000000000..7ca0c9638f --- /dev/null +++ b/queue-6.13/net-protect-netdev-napi_list-with-netdev_lock.patch @@ -0,0 +1,213 @@ +From b17b565e17e45a0c9a03a3fa17b254f4574b3aa3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jan 2025 19:53:13 -0800 +Subject: net: protect netdev->napi_list with netdev_lock() + +From: Jakub Kicinski + +[ Upstream commit 1b23cdbd2bbc4b40e21c12ae86c2781e347ff0f8 ] + +Hold netdev->lock when NAPIs are getting added or removed. 
+This will allow safe access to NAPI instances of a net_device +without rtnl_lock. + +Create a family of helpers which assume the lock is already taken. +Switch iavf to them, as it makes extensive use of netdev->lock, +already. + +Reviewed-by: Joe Damato +Reviewed-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250115035319.559603-6-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/iavf/iavf_main.c | 6 +-- + include/linux/netdevice.h | 54 ++++++++++++++++++--- + net/core/dev.c | 15 ++++-- + 3 files changed, 60 insertions(+), 15 deletions(-) + +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index 72314b0a1b25b..4639f55a17be1 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -1815,8 +1815,8 @@ static int iavf_alloc_q_vectors(struct iavf_adapter *adapter) + q_vector->v_idx = q_idx; + q_vector->reg_idx = q_idx; + cpumask_copy(&q_vector->affinity_mask, cpu_possible_mask); +- netif_napi_add(adapter->netdev, &q_vector->napi, +- iavf_napi_poll); ++ netif_napi_add_locked(adapter->netdev, &q_vector->napi, ++ iavf_napi_poll); + } + + return 0; +@@ -1842,7 +1842,7 @@ static void iavf_free_q_vectors(struct iavf_adapter *adapter) + for (q_idx = 0; q_idx < num_q_vectors; q_idx++) { + struct iavf_q_vector *q_vector = &adapter->q_vectors[q_idx]; + +- netif_napi_del(&q_vector->napi); ++ netif_napi_del_locked(&q_vector->napi); + } + kfree(adapter->q_vectors); + adapter->q_vectors = NULL; +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 64013fd389f28..7966a3d0e5bbc 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2454,7 +2454,7 @@ struct net_device { + * Drivers are free to use it for other protection. 
+ * + * Protects: +- * @net_shaper_hierarchy, @reg_state ++ * @napi_list, @net_shaper_hierarchy, @reg_state + * + * Partially protects (writers must hold both @lock and rtnl_lock): + * @up +@@ -2714,8 +2714,19 @@ static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) + */ + #define NAPI_POLL_WEIGHT 64 + +-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, +- int (*poll)(struct napi_struct *, int), int weight); ++void netif_napi_add_weight_locked(struct net_device *dev, ++ struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), ++ int weight); ++ ++static inline void ++netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int weight) ++{ ++ netdev_lock(dev); ++ netif_napi_add_weight_locked(dev, napi, poll, weight); ++ netdev_unlock(dev); ++} + + /** + * netif_napi_add() - initialize a NAPI context +@@ -2733,6 +2744,13 @@ netif_napi_add(struct net_device *dev, struct napi_struct *napi, + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); + } + ++static inline void ++netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int)) ++{ ++ netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); ++} ++ + static inline void + netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, +@@ -2743,6 +2761,15 @@ netif_napi_add_tx_weight(struct net_device *dev, + netif_napi_add_weight(dev, napi, poll, weight); + } + ++static inline void ++netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int index) ++{ ++ napi->index = index; ++ napi->config = &dev->napi_config[index]; ++ netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); ++} ++ + /** + * netif_napi_add_config - initialize a NAPI context with persistent config + * @dev: network device +@@ -2754,9 +2781,9 @@ static inline void + netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) + { +- napi->index = index; +- napi->config = &dev->napi_config[index]; +- netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); ++ netdev_lock(dev); ++ netif_napi_add_config_locked(dev, napi, poll, index); ++ netdev_unlock(dev); + } + + /** +@@ -2776,6 +2803,8 @@ static inline void netif_napi_add_tx(struct net_device *dev, + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); + } + ++void __netif_napi_del_locked(struct napi_struct *napi); ++ + /** + * __netif_napi_del - remove a NAPI context + * @napi: NAPI context +@@ -2784,7 +2813,18 @@ static inline void netif_napi_add_tx(struct net_device *dev, + * containing @napi. Drivers might want to call this helper to combine + * all the needed RCU grace periods into a single one. 
+ */ +-void __netif_napi_del(struct napi_struct *napi); ++static inline void __netif_napi_del(struct napi_struct *napi) ++{ ++ netdev_lock(napi->dev); ++ __netif_napi_del_locked(napi); ++ netdev_unlock(napi->dev); ++} ++ ++static inline void netif_napi_del_locked(struct napi_struct *napi) ++{ ++ __netif_napi_del_locked(napi); ++ synchronize_net(); ++} + + /** + * netif_napi_del - remove a NAPI context +diff --git a/net/core/dev.c b/net/core/dev.c +index 67f2bb84db543..26cce0504f105 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -6783,9 +6783,12 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ + } + +-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, +- int (*poll)(struct napi_struct *, int), int weight) ++void netif_napi_add_weight_locked(struct net_device *dev, ++ struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), ++ int weight) + { ++ netdev_assert_locked(dev); + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + +@@ -6826,7 +6829,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + dev->threaded = false; + netif_napi_set_irq(napi, -1); + } +-EXPORT_SYMBOL(netif_napi_add_weight); ++EXPORT_SYMBOL(netif_napi_add_weight_locked); + + void napi_disable(struct napi_struct *n) + { +@@ -6897,8 +6900,10 @@ static void flush_gro_hash(struct napi_struct *napi) + } + + /* Must be called in process context */ +-void __netif_napi_del(struct napi_struct *napi) ++void __netif_napi_del_locked(struct napi_struct *napi) + { ++ netdev_assert_locked(napi->dev); ++ + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + +@@ -6918,7 +6923,7 @@ void __netif_napi_del(struct napi_struct *napi) + napi->thread = NULL; + } + } +-EXPORT_SYMBOL(__netif_napi_del); ++EXPORT_SYMBOL(__netif_napi_del_locked); + + static int __napi_poll(struct napi_struct *n, bool *repoll) + { +-- +2.39.5 + diff --git a/queue-6.13/openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch b/queue-6.13/openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch new file mode 100644 index 0000000000..17c2168ada --- /dev/null +++ b/queue-6.13/openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch @@ -0,0 +1,66 @@ +From 8f62caf308d25d2da671456286fc6d18cc149722 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 7 Feb 2025 13:58:37 +0000 +Subject: openvswitch: use RCU protection in ovs_vport_cmd_fill_info() + +From: Eric Dumazet + +[ Upstream commit 90b2f49a502fa71090d9f4fe29a2f51fe5dff76d ] + +ovs_vport_cmd_fill_info() can be called without RTNL or RCU. + +Use RCU protection and dev_net_rcu() to avoid potential UAF. 
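+
+The resulting pattern is (sketch, mirroring the diff below;
+dev_net_rcu() must run under rcu_read_lock(), which is also why the
+netnsid allocation switches to GFP_ATOMIC):
+
+	rcu_read_lock();
+	net_vport = dev_net_rcu(vport->dev);
+	if (!net_eq(net, net_vport))
+		id = peernet2id_alloc(net, net_vport, GFP_ATOMIC);
+	rcu_read_unlock();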
+ +Fixes: 9354d4520342 ("openvswitch: reliable interface indentification in port dumps") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20250207135841.1948589-6-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/openvswitch/datapath.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c +index 225f6048867f4..5d548eda742df 100644 +--- a/net/openvswitch/datapath.c ++++ b/net/openvswitch/datapath.c +@@ -2101,6 +2101,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + { + struct ovs_header *ovs_header; + struct ovs_vport_stats vport_stats; ++ struct net *net_vport; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, +@@ -2117,12 +2118,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) + goto nla_put_failure; + +- if (!net_eq(net, dev_net(vport->dev))) { +- int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); ++ rcu_read_lock(); ++ net_vport = dev_net_rcu(vport->dev); ++ if (!net_eq(net, net_vport)) { ++ int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); + + if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) +- goto nla_put_failure; ++ goto nla_put_failure_unlock; + } ++ rcu_read_unlock(); + + ovs_vport_get_stats(vport, &vport_stats); + if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, +@@ -2143,6 +2147,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + genlmsg_end(skb, ovs_header); + return 0; + ++nla_put_failure_unlock: ++ rcu_read_unlock(); + nla_put_failure: + err = -EMSGSIZE; + error: +-- +2.39.5 + diff --git a/queue-6.13/reapply-net-skb-introduce-and-use-a-single-page-frag.patch b/queue-6.13/reapply-net-skb-introduce-and-use-a-single-page-frag.patch new file mode 100644 index 0000000000..388613b623 --- /dev/null +++ b/queue-6.13/reapply-net-skb-introduce-and-use-a-single-page-frag.patch @@ -0,0 +1,214 @@ +From 9d8b397523c329a7127e99f9a2107e6aa6a758c7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Feb 2025 08:49:44 -0800 +Subject: Reapply "net: skb: introduce and use a single page frag cache" + +From: Jakub Kicinski + +[ Upstream commit 0892b840318daa6ae739b7cdec5ecdfca4006689 ] + +This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76. + +Sabrina reports that the revert may trigger warnings due to intervening +changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it +and revisit once that part is also ironed out. 
+ +Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") +Reported-by: Sabrina Dubroca +Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 1 + + net/core/dev.c | 17 ------- + net/core/skbuff.c | 103 ++++++++++++++++++++++++++++++++++++-- + 3 files changed, 99 insertions(+), 22 deletions(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 4cb08af483438..7966a3d0e5bbc 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -4048,6 +4048,7 @@ void netif_receive_skb_list(struct list_head *head); + gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); + void napi_gro_flush(struct napi_struct *napi, bool flush_old); + struct sk_buff *napi_get_frags(struct napi_struct *napi); ++void napi_get_frags_check(struct napi_struct *napi); + gro_result_t napi_gro_frags(struct napi_struct *napi); + + static inline void napi_free_frags(struct napi_struct *napi) +diff --git a/net/core/dev.c b/net/core/dev.c +index 571d38ca2bee7..26cce0504f105 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -6783,23 +6783,6 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ + } + +-/* Double check that napi_get_frags() allocates skbs with +- * skb->head being backed by slab, not a page fragment. +- * This is to make sure bug fixed in 3226b158e67c +- * ("net: avoid 32 x truesize under-estimation for tiny skbs") +- * does not accidentally come back. +- */ +-static void napi_get_frags_check(struct napi_struct *napi) +-{ +- struct sk_buff *skb; +- +- local_bh_disable(); +- skb = napi_get_frags(napi); +- WARN_ON_ONCE(skb && skb->head_frag); +- napi_free_frags(napi); +- local_bh_enable(); +-} +- + void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index b32d4e1fa4428..6841e61a6bd0b 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) + #define NAPI_SKB_CACHE_BULK 16 + #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) + ++#if PAGE_SIZE == SZ_4K ++ ++#define NAPI_HAS_SMALL_PAGE_FRAG 1 ++#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) ++ ++/* specialized page frag allocator using a single order 0 page ++ * and slicing it into 1K sized fragment. 
Constrained to systems ++ * with a very limited amount of 1K fragments fitting a single ++ * page - to avoid excessive truesize underestimation ++ */ ++ ++struct page_frag_1k { ++ void *va; ++ u16 offset; ++ bool pfmemalloc; ++}; ++ ++static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) ++{ ++ struct page *page; ++ int offset; ++ ++ offset = nc->offset - SZ_1K; ++ if (likely(offset >= 0)) ++ goto use_frag; ++ ++ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); ++ if (!page) ++ return NULL; ++ ++ nc->va = page_address(page); ++ nc->pfmemalloc = page_is_pfmemalloc(page); ++ offset = PAGE_SIZE - SZ_1K; ++ page_ref_add(page, offset / SZ_1K); ++ ++use_frag: ++ nc->offset = offset; ++ return nc->va + offset; ++} ++#else ++ ++/* the small page is actually unused in this build; add dummy helpers ++ * to please the compiler and avoid later preprocessor's conditionals ++ */ ++#define NAPI_HAS_SMALL_PAGE_FRAG 0 ++#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false ++ ++struct page_frag_1k { ++}; ++ ++static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) ++{ ++ return NULL; ++} ++ ++#endif ++ + struct napi_alloc_cache { + local_lock_t bh_lock; + struct page_frag_cache page; ++ struct page_frag_1k page_small; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; + }; +@@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), + }; + ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) + { + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); +@@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) + + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. 
++ * When the small frag allocator is available, prefer it over kmalloc ++ * for small fragments + */ +- if (len <= SKB_WITH_OVERHEAD(1024) || ++ if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, +@@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) + goto skb_success; + } + +- len = SKB_HEAD_ALIGN(len); +- + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); ++ if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { ++ /* we are artificially inflating the allocation size, but ++ * that is not as bad as it may look like, as: ++ * - 'len' less than GRO_MAX_HEAD makes little sense ++ * - On most systems, larger 'len' values lead to fragment ++ * size above 512 bytes ++ * - kmalloc would use the kmalloc-1k slab for such values ++ * - Builds with smaller GRO_MAX_HEAD will very likely do ++ * little networking, as that implies no WiFi and no ++ * tunnels support, and 32 bits arches. ++ */ ++ len = SZ_1K; + +- data = page_frag_alloc(&nc->page, len, gfp_mask); +- pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); ++ data = page_frag_alloc_1k(&nc->page_small, gfp_mask); ++ pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); ++ } else { ++ len = SKB_HEAD_ALIGN(len); ++ ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); ++ } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + if (unlikely(!data)) +-- +2.39.5 + diff --git a/queue-6.13/revert-net-skb-introduce-and-use-a-single-page-frag-.patch b/queue-6.13/revert-net-skb-introduce-and-use-a-single-page-frag-.patch new file mode 100644 index 0000000000..f1b7dd3374 --- /dev/null +++ b/queue-6.13/revert-net-skb-introduce-and-use-a-single-page-frag-.patch @@ -0,0 +1,232 @@ +From 46d3da5ebf5d3e897f42525d3c3e03501621f15b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Feb 2025 22:28:48 +0100 +Subject: Revert "net: skb: introduce and use a single page frag cache" + +From: Paolo Abeni + +[ Upstream commit 011b0335903832facca86cd8ed05d7d8d94c9c76 ] + +This reverts commit dbae2b062824 ("net: skb: introduce and use a single +page frag cache"). The intended goal of such change was to counter a +performance regression introduced by commit 3226b158e67c ("net: avoid +32 x truesize under-estimation for tiny skbs"). + +Unfortunately, the blamed commit introduces another regression for the +virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny +size, so that the whole head frag could fit a 512-byte block. + +The single page frag cache uses a 1K fragment for such allocation, and +the additional overhead, under small UDP packets flood, makes the page +allocator a bottleneck. + +Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for +typical/small skb->head"), this revert does not re-introduce the +original regression. Actually, in the relevant test on top of this +revert, I measure a small but noticeable positive delta, just above +noise level. + +The revert itself required some additional mangling due to the +introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the +affected code. 
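+
+For scale (illustrative arithmetic, assuming 4K pages): the removed
+allocator serves PAGE_SIZE / SZ_1K = 4 fragments per order-0 page, so
+a flood of tiny-head skbs returns to the page allocator every four
+allocations, while the dedicated kmem_cache added by bf9f1baa279f
+amortizes far better for ~512 byte heads.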
+ +Suggested-by: Eric Dumazet +Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") +Signed-off-by: Paolo Abeni +Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 1 - + net/core/dev.c | 17 +++++++ + net/core/skbuff.c | 103 ++------------------------------------ + 3 files changed, 22 insertions(+), 99 deletions(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 7966a3d0e5bbc..4cb08af483438 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -4048,7 +4048,6 @@ void netif_receive_skb_list(struct list_head *head); + gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); + void napi_gro_flush(struct napi_struct *napi, bool flush_old); + struct sk_buff *napi_get_frags(struct napi_struct *napi); +-void napi_get_frags_check(struct napi_struct *napi); + gro_result_t napi_gro_frags(struct napi_struct *napi); + + static inline void napi_free_frags(struct napi_struct *napi) +diff --git a/net/core/dev.c b/net/core/dev.c +index 26cce0504f105..571d38ca2bee7 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -6783,6 +6783,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ + } + ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++static void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ + void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 6841e61a6bd0b..b32d4e1fa4428 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -220,67 +220,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) + #define NAPI_SKB_CACHE_BULK 16 + #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) + +-#if PAGE_SIZE == SZ_4K +- +-#define NAPI_HAS_SMALL_PAGE_FRAG 1 +-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) +- +-/* specialized page frag allocator using a single order 0 page +- * and slicing it into 1K sized fragment. 
Constrained to systems +- * with a very limited amount of 1K fragments fitting a single +- * page - to avoid excessive truesize underestimation +- */ +- +-struct page_frag_1k { +- void *va; +- u16 offset; +- bool pfmemalloc; +-}; +- +-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +-{ +- struct page *page; +- int offset; +- +- offset = nc->offset - SZ_1K; +- if (likely(offset >= 0)) +- goto use_frag; +- +- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); +- if (!page) +- return NULL; +- +- nc->va = page_address(page); +- nc->pfmemalloc = page_is_pfmemalloc(page); +- offset = PAGE_SIZE - SZ_1K; +- page_ref_add(page, offset / SZ_1K); +- +-use_frag: +- nc->offset = offset; +- return nc->va + offset; +-} +-#else +- +-/* the small page is actually unused in this build; add dummy helpers +- * to please the compiler and avoid later preprocessor's conditionals +- */ +-#define NAPI_HAS_SMALL_PAGE_FRAG 0 +-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false +- +-struct page_frag_1k { +-}; +- +-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +-{ +- return NULL; +-} +- +-#endif +- + struct napi_alloc_cache { + local_lock_t bh_lock; + struct page_frag_cache page; +- struct page_frag_1k page_small; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; + }; +@@ -290,23 +232,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), + }; + +-/* Double check that napi_get_frags() allocates skbs with +- * skb->head being backed by slab, not a page fragment. +- * This is to make sure bug fixed in 3226b158e67c +- * ("net: avoid 32 x truesize under-estimation for tiny skbs") +- * does not accidentally come back. +- */ +-void napi_get_frags_check(struct napi_struct *napi) +-{ +- struct sk_buff *skb; +- +- local_bh_disable(); +- skb = napi_get_frags(napi); +- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); +- napi_free_frags(napi); +- local_bh_enable(); +-} +- + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) + { + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); +@@ -813,10 +738,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) + + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. 
+- * When the small frag allocator is available, prefer it over kmalloc +- * for small fragments + */ +- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || ++ if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, +@@ -826,32 +749,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) + goto skb_success; + } + ++ len = SKB_HEAD_ALIGN(len); ++ + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); +- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { +- /* we are artificially inflating the allocation size, but +- * that is not as bad as it may look like, as: +- * - 'len' less than GRO_MAX_HEAD makes little sense +- * - On most systems, larger 'len' values lead to fragment +- * size above 512 bytes +- * - kmalloc would use the kmalloc-1k slab for such values +- * - Builds with smaller GRO_MAX_HEAD will very likely do +- * little networking, as that implies no WiFi and no +- * tunnels support, and 32 bits arches. +- */ +- len = SZ_1K; + +- data = page_frag_alloc_1k(&nc->page_small, gfp_mask); +- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); +- } else { +- len = SKB_HEAD_ALIGN(len); +- +- data = page_frag_alloc(&nc->page, len, gfp_mask); +- pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); +- } ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + if (unlikely(!data)) +-- +2.39.5 + diff --git a/queue-6.13/rust-kbuild-add-fzero-init-padding-bits-to-bindgen_s.patch b/queue-6.13/rust-kbuild-add-fzero-init-padding-bits-to-bindgen_s.patch new file mode 100644 index 0000000000..d599ea17e3 --- /dev/null +++ b/queue-6.13/rust-kbuild-add-fzero-init-padding-bits-to-bindgen_s.patch @@ -0,0 +1,42 @@ +From 7135c6ca6952e39b35856546f51da533d3f8145c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 29 Jan 2025 14:50:02 -0700 +Subject: rust: kbuild: add -fzero-init-padding-bits to bindgen_skip_cflags + +From: Justin M. Forbes + +[ Upstream commit a9c621a217128eb3fb7522cf763992d9437fd5ba ] + +This seems to break the build when building with gcc15: + + Unable to generate bindings: ClangDiagnostic("error: unknown + argument: '-fzero-init-padding-bits=all'\n") + +Thus skip that flag. + +Signed-off-by: Justin M. Forbes +Fixes: dce4aab8441d ("kbuild: Use -fzero-init-padding-bits=all") +Reviewed-by: Kees Cook +Link: https://lore.kernel.org/r/20250129215003.1736127-1-jforbes@fedoraproject.org +[ Slightly reworded commit. - Miguel ] +Signed-off-by: Miguel Ojeda +Signed-off-by: Sasha Levin +--- + rust/Makefile | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/rust/Makefile b/rust/Makefile +index a40a3936126d6..43cd7f845a9a3 100644 +--- a/rust/Makefile ++++ b/rust/Makefile +@@ -238,6 +238,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ + -fzero-call-used-regs=% -fno-stack-clash-protection \ + -fno-inline-functions-called-once -fsanitize=bounds-strict \ + -fstrict-flex-arrays=% -fmin-function-alignment=% \ ++ -fzero-init-padding-bits=% \ + --param=% --param asan-% + + # Derived from `scripts/Makefile.clang`. 
+-- +2.39.5 + diff --git a/queue-6.13/s390-qeth-move-netif_napi_add_tx-and-napi_enable-fro.patch b/queue-6.13/s390-qeth-move-netif_napi_add_tx-and-napi_enable-fro.patch new file mode 100644 index 0000000000..8f9d1f7402 --- /dev/null +++ b/queue-6.13/s390-qeth-move-netif_napi_add_tx-and-napi_enable-fro.patch @@ -0,0 +1,54 @@ +From 48eda8093b86b426078bd245a9b4fbc5d057c436 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 12 Feb 2025 17:36:59 +0100 +Subject: s390/qeth: move netif_napi_add_tx() and napi_enable() from under BH + +From: Alexandra Winter + +[ Upstream commit 0d0b752f2497471ddd2b32143d167d42e18a8f3c ] + +Like other drivers qeth is calling local_bh_enable() after napi_schedule() +to kick-start softirqs [0]. +Since netif_napi_add_tx() and napi_enable() now take the netdev_lock() +mutex [1], move them out from under the BH protection. Same solution as in +commit a60558644e20 ("wifi: mt76: move napi_enable() from under BH") + +Fixes: 1b23cdbd2bbc ("net: protect netdev->napi_list with netdev_lock()") +Link: https://lore.kernel.org/netdev/20240612181900.4d9d18d0@kernel.org/ [0] +Link: https://lore.kernel.org/netdev/20250115035319.559603-1-kuba@kernel.org/ [1] +Signed-off-by: Alexandra Winter +Acked-by: Joe Damato +Link: https://patch.msgid.link/20250212163659.2287292-1-wintera@linux.ibm.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/s390/net/qeth_core_main.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c +index a3adaec5504e4..20328d695ef92 100644 +--- a/drivers/s390/net/qeth_core_main.c ++++ b/drivers/s390/net/qeth_core_main.c +@@ -7050,14 +7050,16 @@ int qeth_open(struct net_device *dev) + card->data.state = CH_STATE_UP; + netif_tx_start_all_queues(dev); + +- local_bh_disable(); + qeth_for_each_output_queue(card, queue, i) { + netif_napi_add_tx(dev, &queue->napi, qeth_tx_poll); + napi_enable(&queue->napi); +- napi_schedule(&queue->napi); + } +- + napi_enable(&card->napi); ++ ++ local_bh_disable(); ++ qeth_for_each_output_queue(card, queue, i) { ++ napi_schedule(&queue->napi); ++ } + napi_schedule(&card->napi); + /* kick-start the NAPI softirq: */ + local_bh_enable(); +-- +2.39.5 + diff --git a/queue-6.13/samples-hid-fix-broken-vmlinux-path-for-vmlinux_btf.patch b/queue-6.13/samples-hid-fix-broken-vmlinux-path-for-vmlinux_btf.patch new file mode 100644 index 0000000000..facbf84255 --- /dev/null +++ b/queue-6.13/samples-hid-fix-broken-vmlinux-path-for-vmlinux_btf.patch @@ -0,0 +1,47 @@ +From 985afc960a31369065f6d51ca350bb674439cc9b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 3 Feb 2025 02:55:06 -0600 +Subject: samples/hid: fix broken vmlinux path for VMLINUX_BTF + +From: Jinghao Jia + +[ Upstream commit 8b125949df58a00e8797c6e6d3f3d3dc08f4d939 ] + +Commit 13b25489b6f8 ("kbuild: change working directory to external +module directory with M=") changed kbuild working directory of hid-bpf +sample programs to samples/hid, which broke the vmlinux path for +VMLINUX_BTF, as the Makefiles assume the current work directory to be +the kernel output directory and use a relative path (i.e., ./vmlinux): + + Makefile:173: *** Cannot find a vmlinux for VMLINUX_BTF at any of " /path/to/linux/samples/hid/vmlinux", build the kernel or set VMLINUX_BTF or VMLINUX_H variable. Stop. + +Correctly refer to the kernel output directory using $(objtree). 
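+
+For example (invocation illustrative): with "make M=samples/hid",
+VMLINUX_BTF now resolves to $(objtree)/vmlinux instead of ./vmlinux
+relative to samples/hid, which does not exist.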
+ +Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") +Tested-by: Ruowen Qin +Suggested-by: Daniel Borkmann +Suggested-by: Andrii Nakryiko +Signed-off-by: Jinghao Jia +Link: https://patch.msgid.link/20250203085506.220297-4-jinghao7@illinois.edu +Signed-off-by: Benjamin Tissoires +Signed-off-by: Sasha Levin +--- + samples/hid/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/samples/hid/Makefile b/samples/hid/Makefile +index 69159c81d0457..db5a077c77fc8 100644 +--- a/samples/hid/Makefile ++++ b/samples/hid/Makefile +@@ -164,7 +164,7 @@ $(obj)/hid_surface_dial.o: $(obj)/hid_surface_dial.skel.h + + VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ +- $(abspath ./vmlinux) ++ $(abspath $(objtree)/vmlinux) + VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + + $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +-- +2.39.5 + diff --git a/queue-6.13/samples-hid-remove-unnecessary-i-flags-from-libbpf-e.patch b/queue-6.13/samples-hid-remove-unnecessary-i-flags-from-libbpf-e.patch new file mode 100644 index 0000000000..6b2fd3cef2 --- /dev/null +++ b/queue-6.13/samples-hid-remove-unnecessary-i-flags-from-libbpf-e.patch @@ -0,0 +1,63 @@ +From 3ad1c6c3598c978e48a6ea67dc83fe6d30312cb0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 3 Feb 2025 02:55:04 -0600 +Subject: samples/hid: remove unnecessary -I flags from libbpf EXTRA_CFLAGS + +From: Jinghao Jia + +[ Upstream commit 1739cafdb8decad538410b05a4640055408826de ] + +Commit 5a6ea7022ff4 ("samples/bpf: Remove unnecessary -I flags from +libbpf EXTRA_CFLAGS") fixed the build error caused by redundant include +path for samples/bpf, but not samples/hid. + +Apply the same fix on samples/hid as well. 
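+
+Concretely, the split is: warning and sysroot flags move into
+COMMON_CFLAGS, which is what the libbpf sub-make receives as
+EXTRA_CFLAGS, while the sample-only -I include paths stay in
+TPROGS_CFLAGS and no longer leak into the libbpf build.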
+ +Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") +Tested-by: Ruowen Qin +Signed-off-by: Jinghao Jia +Link: https://patch.msgid.link/20250203085506.220297-2-jinghao7@illinois.edu +Signed-off-by: Benjamin Tissoires +Signed-off-by: Sasha Levin +--- + samples/hid/Makefile | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/samples/hid/Makefile b/samples/hid/Makefile +index 8ea59e9631a33..69159c81d0457 100644 +--- a/samples/hid/Makefile ++++ b/samples/hid/Makefile +@@ -40,16 +40,17 @@ BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic + endif + endif + +-TPROGS_CFLAGS += -Wall -O2 +-TPROGS_CFLAGS += -Wmissing-prototypes +-TPROGS_CFLAGS += -Wstrict-prototypes ++COMMON_CFLAGS += -Wall -O2 ++COMMON_CFLAGS += -Wmissing-prototypes ++COMMON_CFLAGS += -Wstrict-prototypes + ++TPROGS_CFLAGS += $(COMMON_CFLAGS) + TPROGS_CFLAGS += -I$(objtree)/usr/include + TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) + TPROGS_CFLAGS += -I$(srctree)/tools/include + + ifdef SYSROOT +-TPROGS_CFLAGS += --sysroot=$(SYSROOT) ++COMMON_CFLAGS += --sysroot=$(SYSROOT) + TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib + endif + +@@ -112,7 +113,7 @@ clean: + + $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) + # Fix up variables inherited from Kbuild that tools/ build system won't like +- $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ ++ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(HID_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers +-- +2.39.5 + diff --git a/queue-6.13/scsi-ufs-core-introduce-a-new-clock_gating-lock.patch b/queue-6.13/scsi-ufs-core-introduce-a-new-clock_gating-lock.patch new file mode 100644 index 0000000000..d8782d0096 --- /dev/null +++ b/queue-6.13/scsi-ufs-core-introduce-a-new-clock_gating-lock.patch @@ -0,0 +1,336 @@ +From 72a36eeeeda669f958f54e4af40c1c042b8af74f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Nov 2024 09:08:07 +0200 +Subject: scsi: ufs: core: Introduce a new clock_gating lock + +From: Avri Altman + +[ Upstream commit 209f4e43b8068c24cde227f464111030430153fa ] + +Introduce a new clock gating lock to serialize access to some of the clock +gating members instead of the host_lock. + +While at it, simplify the code with the guard() macro and co for automatic +cleanup of the new lock. There are some explicit +spin_lock_irqsave()/spin_unlock_irqrestore() snaking instances I left +behind because I couldn't make heads or tails of it. + +Additionally, move the trace_ufshcd_clk_gating() call from inside the +region protected by the lock as it doesn't needs protection. + +Signed-off-by: Avri Altman +Link: https://lore.kernel.org/r/20241124070808.194860-4-avri.altman@wdc.com +Reviewed-by: Bart Van Assche +Signed-off-by: Martin K. 
Petersen +Stable-dep-of: 839a74b5649c ("scsi: ufs: Fix toggling of clk_gating.state when clock gating is not allowed") +Signed-off-by: Sasha Levin +--- + drivers/ufs/core/ufshcd.c | 109 ++++++++++++++++++-------------------- + include/ufs/ufshcd.h | 9 +++- + 2 files changed, 59 insertions(+), 59 deletions(-) + +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index 36725a12edd1e..8a52397276b74 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -1816,19 +1816,16 @@ static void ufshcd_exit_clk_scaling(struct ufs_hba *hba) + static void ufshcd_ungate_work(struct work_struct *work) + { + int ret; +- unsigned long flags; + struct ufs_hba *hba = container_of(work, struct ufs_hba, + clk_gating.ungate_work); + + cancel_delayed_work_sync(&hba->clk_gating.gate_work); + +- spin_lock_irqsave(hba->host->host_lock, flags); +- if (hba->clk_gating.state == CLKS_ON) { +- spin_unlock_irqrestore(hba->host->host_lock, flags); +- return; ++ scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) { ++ if (hba->clk_gating.state == CLKS_ON) ++ return; + } + +- spin_unlock_irqrestore(hba->host->host_lock, flags); + ufshcd_hba_vreg_set_hpm(hba); + ufshcd_setup_clocks(hba, true); + +@@ -1863,7 +1860,7 @@ void ufshcd_hold(struct ufs_hba *hba) + if (!ufshcd_is_clkgating_allowed(hba) || + !hba->clk_gating.is_initialized) + return; +- spin_lock_irqsave(hba->host->host_lock, flags); ++ spin_lock_irqsave(&hba->clk_gating.lock, flags); + hba->clk_gating.active_reqs++; + + start: +@@ -1879,11 +1876,11 @@ void ufshcd_hold(struct ufs_hba *hba) + */ + if (ufshcd_can_hibern8_during_gating(hba) && + ufshcd_is_link_hibern8(hba)) { +- spin_unlock_irqrestore(hba->host->host_lock, flags); ++ spin_unlock_irqrestore(&hba->clk_gating.lock, flags); + flush_result = flush_work(&hba->clk_gating.ungate_work); + if (hba->clk_gating.is_suspended && !flush_result) + return; +- spin_lock_irqsave(hba->host->host_lock, flags); ++ spin_lock_irqsave(&hba->clk_gating.lock, flags); + goto start; + } + break; +@@ -1912,17 +1909,17 @@ void ufshcd_hold(struct ufs_hba *hba) + */ + fallthrough; + case REQ_CLKS_ON: +- spin_unlock_irqrestore(hba->host->host_lock, flags); ++ spin_unlock_irqrestore(&hba->clk_gating.lock, flags); + flush_work(&hba->clk_gating.ungate_work); + /* Make sure state is CLKS_ON before returning */ +- spin_lock_irqsave(hba->host->host_lock, flags); ++ spin_lock_irqsave(&hba->clk_gating.lock, flags); + goto start; + default: + dev_err(hba->dev, "%s: clk gating is in invalid state %d\n", + __func__, hba->clk_gating.state); + break; + } +- spin_unlock_irqrestore(hba->host->host_lock, flags); ++ spin_unlock_irqrestore(&hba->clk_gating.lock, flags); + } + EXPORT_SYMBOL_GPL(ufshcd_hold); + +@@ -1930,30 +1927,32 @@ static void ufshcd_gate_work(struct work_struct *work) + { + struct ufs_hba *hba = container_of(work, struct ufs_hba, + clk_gating.gate_work.work); +- unsigned long flags; + int ret; + +- spin_lock_irqsave(hba->host->host_lock, flags); +- /* +- * In case you are here to cancel this work the gating state +- * would be marked as REQ_CLKS_ON. In this case save time by +- * skipping the gating work and exit after changing the clock +- * state to CLKS_ON. 
+- */ +- if (hba->clk_gating.is_suspended || +- (hba->clk_gating.state != REQ_CLKS_OFF)) { +- hba->clk_gating.state = CLKS_ON; +- trace_ufshcd_clk_gating(dev_name(hba->dev), +- hba->clk_gating.state); +- goto rel_lock; +- } ++ scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) { ++ /* ++ * In case you are here to cancel this work the gating state ++ * would be marked as REQ_CLKS_ON. In this case save time by ++ * skipping the gating work and exit after changing the clock ++ * state to CLKS_ON. ++ */ ++ if (hba->clk_gating.is_suspended || ++ hba->clk_gating.state != REQ_CLKS_OFF) { ++ hba->clk_gating.state = CLKS_ON; ++ trace_ufshcd_clk_gating(dev_name(hba->dev), ++ hba->clk_gating.state); ++ return; ++ } + +- if (ufshcd_is_ufs_dev_busy(hba) || +- hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL || +- hba->clk_gating.active_reqs) +- goto rel_lock; ++ if (hba->clk_gating.active_reqs) ++ return; ++ } + +- spin_unlock_irqrestore(hba->host->host_lock, flags); ++ scoped_guard(spinlock_irqsave, hba->host->host_lock) { ++ if (ufshcd_is_ufs_dev_busy(hba) || ++ hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) ++ return; ++ } + + /* put the link into hibern8 mode before turning off clocks */ + if (ufshcd_can_hibern8_during_gating(hba)) { +@@ -1964,7 +1963,7 @@ static void ufshcd_gate_work(struct work_struct *work) + __func__, ret); + trace_ufshcd_clk_gating(dev_name(hba->dev), + hba->clk_gating.state); +- goto out; ++ return; + } + ufshcd_set_link_hibern8(hba); + } +@@ -1984,32 +1983,34 @@ static void ufshcd_gate_work(struct work_struct *work) + * prevent from doing cancel work multiple times when there are + * new requests arriving before the current cancel work is done. + */ +- spin_lock_irqsave(hba->host->host_lock, flags); ++ guard(spinlock_irqsave)(&hba->clk_gating.lock); + if (hba->clk_gating.state == REQ_CLKS_OFF) { + hba->clk_gating.state = CLKS_OFF; + trace_ufshcd_clk_gating(dev_name(hba->dev), + hba->clk_gating.state); + } +-rel_lock: +- spin_unlock_irqrestore(hba->host->host_lock, flags); +-out: +- return; + } + +-/* host lock must be held before calling this variant */ + static void __ufshcd_release(struct ufs_hba *hba) + { ++ lockdep_assert_held(&hba->clk_gating.lock); ++ + if (!ufshcd_is_clkgating_allowed(hba)) + return; + + hba->clk_gating.active_reqs--; + + if (hba->clk_gating.active_reqs || hba->clk_gating.is_suspended || +- hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL || +- ufshcd_has_pending_tasks(hba) || !hba->clk_gating.is_initialized || ++ !hba->clk_gating.is_initialized || + hba->clk_gating.state == CLKS_OFF) + return; + ++ scoped_guard(spinlock_irqsave, hba->host->host_lock) { ++ if (ufshcd_has_pending_tasks(hba) || ++ hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) ++ return; ++ } ++ + hba->clk_gating.state = REQ_CLKS_OFF; + trace_ufshcd_clk_gating(dev_name(hba->dev), hba->clk_gating.state); + queue_delayed_work(hba->clk_gating.clk_gating_workq, +@@ -2019,11 +2020,8 @@ static void __ufshcd_release(struct ufs_hba *hba) + + void ufshcd_release(struct ufs_hba *hba) + { +- unsigned long flags; +- +- spin_lock_irqsave(hba->host->host_lock, flags); ++ guard(spinlock_irqsave)(&hba->clk_gating.lock); + __ufshcd_release(hba); +- spin_unlock_irqrestore(hba->host->host_lock, flags); + } + EXPORT_SYMBOL_GPL(ufshcd_release); + +@@ -2038,11 +2036,9 @@ static ssize_t ufshcd_clkgate_delay_show(struct device *dev, + void ufshcd_clkgate_delay_set(struct device *dev, unsigned long value) + { + struct ufs_hba *hba = dev_get_drvdata(dev); +- unsigned long flags; + +- 
spin_lock_irqsave(hba->host->host_lock, flags); ++ guard(spinlock_irqsave)(&hba->clk_gating.lock); + hba->clk_gating.delay_ms = value; +- spin_unlock_irqrestore(hba->host->host_lock, flags); + } + EXPORT_SYMBOL_GPL(ufshcd_clkgate_delay_set); + +@@ -2070,7 +2066,6 @@ static ssize_t ufshcd_clkgate_enable_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) + { + struct ufs_hba *hba = dev_get_drvdata(dev); +- unsigned long flags; + u32 value; + + if (kstrtou32(buf, 0, &value)) +@@ -2078,9 +2073,10 @@ static ssize_t ufshcd_clkgate_enable_store(struct device *dev, + + value = !!value; + +- spin_lock_irqsave(hba->host->host_lock, flags); ++ guard(spinlock_irqsave)(&hba->clk_gating.lock); ++ + if (value == hba->clk_gating.is_enabled) +- goto out; ++ return count; + + if (value) + __ufshcd_release(hba); +@@ -2088,8 +2084,7 @@ static ssize_t ufshcd_clkgate_enable_store(struct device *dev, + hba->clk_gating.active_reqs++; + + hba->clk_gating.is_enabled = value; +-out: +- spin_unlock_irqrestore(hba->host->host_lock, flags); ++ + return count; + } + +@@ -2131,6 +2126,8 @@ static void ufshcd_init_clk_gating(struct ufs_hba *hba) + INIT_DELAYED_WORK(&hba->clk_gating.gate_work, ufshcd_gate_work); + INIT_WORK(&hba->clk_gating.ungate_work, ufshcd_ungate_work); + ++ spin_lock_init(&hba->clk_gating.lock); ++ + hba->clk_gating.clk_gating_workq = alloc_ordered_workqueue( + "ufs_clk_gating_%d", WQ_MEM_RECLAIM | WQ_HIGHPRI, + hba->host->host_no); +@@ -9163,7 +9160,6 @@ static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on) + int ret = 0; + struct ufs_clk_info *clki; + struct list_head *head = &hba->clk_list_head; +- unsigned long flags; + ktime_t start = ktime_get(); + bool clk_state_changed = false; + +@@ -9214,11 +9210,10 @@ static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on) + clk_disable_unprepare(clki->clk); + } + } else if (!ret && on) { +- spin_lock_irqsave(hba->host->host_lock, flags); +- hba->clk_gating.state = CLKS_ON; ++ scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) ++ hba->clk_gating.state = CLKS_ON; + trace_ufshcd_clk_gating(dev_name(hba->dev), + hba->clk_gating.state); +- spin_unlock_irqrestore(hba->host->host_lock, flags); + } + + if (clk_state_changed) +diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h +index 82b2d2b25c23b..ee2adc4de05e0 100644 +--- a/include/ufs/ufshcd.h ++++ b/include/ufs/ufshcd.h +@@ -403,6 +403,9 @@ enum clk_gating_state { + * delay_ms + * @ungate_work: worker to turn on clocks that will be used in case of + * interrupt context ++ * @clk_gating_workq: workqueue for clock gating work. ++ * @lock: serialize access to some struct ufs_clk_gating members. An outer lock ++ * relative to the host lock + * @state: the current clocks state + * @delay_ms: gating delay in ms + * @is_suspended: clk gating is suspended when set to 1 which can be used +@@ -413,11 +416,14 @@ enum clk_gating_state { + * @is_initialized: Indicates whether clock gating is initialized or not + * @active_reqs: number of requests that are pending and should be waited for + * completion before gating clocks. +- * @clk_gating_workq: workqueue for clock gating work. 
+ */ + struct ufs_clk_gating { + struct delayed_work gate_work; + struct work_struct ungate_work; ++ struct workqueue_struct *clk_gating_workq; ++ ++ spinlock_t lock; ++ + enum clk_gating_state state; + unsigned long delay_ms; + bool is_suspended; +@@ -426,7 +432,6 @@ struct ufs_clk_gating { + bool is_enabled; + bool is_initialized; + int active_reqs; +- struct workqueue_struct *clk_gating_workq; + }; + + /** +-- +2.39.5 + diff --git a/queue-6.13/scsi-ufs-core-introduce-ufshcd_has_pending_tasks.patch b/queue-6.13/scsi-ufs-core-introduce-ufshcd_has_pending_tasks.patch new file mode 100644 index 0000000000..cf548ad899 --- /dev/null +++ b/queue-6.13/scsi-ufs-core-introduce-ufshcd_has_pending_tasks.patch @@ -0,0 +1,58 @@ +From 834418d2ccfaf35d9a098b3aa088f3f04443dc3b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Nov 2024 09:08:05 +0200 +Subject: scsi: ufs: core: Introduce ufshcd_has_pending_tasks() + +From: Avri Altman + +[ Upstream commit e738ba458e7539be1757dcdf85835a5c7b11fad4 ] + +Prepare to remove hba->clk_gating.active_reqs check from +ufshcd_is_ufs_dev_busy(). + +Signed-off-by: Avri Altman +Link: https://lore.kernel.org/r/20241124070808.194860-2-avri.altman@wdc.com +Reviewed-by: Bart Van Assche +Signed-off-by: Martin K. Petersen +Stable-dep-of: 839a74b5649c ("scsi: ufs: Fix toggling of clk_gating.state when clock gating is not allowed") +Signed-off-by: Sasha Levin +--- + drivers/ufs/core/ufshcd.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index d4a628169a51a..f2cacdac1a4fe 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -258,10 +258,16 @@ ufs_get_desired_pm_lvl_for_dev_link_state(enum ufs_dev_pwr_mode dev_state, + return UFS_PM_LVL_0; + } + ++static bool ufshcd_has_pending_tasks(struct ufs_hba *hba) ++{ ++ return hba->outstanding_tasks || hba->active_uic_cmd || ++ hba->uic_async_done; ++} ++ + static bool ufshcd_is_ufs_dev_busy(struct ufs_hba *hba) + { +- return (hba->clk_gating.active_reqs || hba->outstanding_reqs || hba->outstanding_tasks || +- hba->active_uic_cmd || hba->uic_async_done); ++ return hba->clk_gating.active_reqs || hba->outstanding_reqs || ++ ufshcd_has_pending_tasks(hba); + } + + static const struct ufs_dev_quirk ufs_fixups[] = { +@@ -1999,8 +2005,7 @@ static void __ufshcd_release(struct ufs_hba *hba) + + if (hba->clk_gating.active_reqs || hba->clk_gating.is_suspended || + hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL || +- hba->outstanding_tasks || !hba->clk_gating.is_initialized || +- hba->active_uic_cmd || hba->uic_async_done || ++ ufshcd_has_pending_tasks(hba) || !hba->clk_gating.is_initialized || + hba->clk_gating.state == CLKS_OFF) + return; + +-- +2.39.5 + diff --git a/queue-6.13/scsi-ufs-core-prepare-to-introduce-a-new-clock_gatin.patch b/queue-6.13/scsi-ufs-core-prepare-to-introduce-a-new-clock_gatin.patch new file mode 100644 index 0000000000..09d327271e --- /dev/null +++ b/queue-6.13/scsi-ufs-core-prepare-to-introduce-a-new-clock_gatin.patch @@ -0,0 +1,61 @@ +From bac913b64f661a20c50664dfdd51f6806e8bad3e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Nov 2024 09:08:06 +0200 +Subject: scsi: ufs: core: Prepare to introduce a new clock_gating lock + +From: Avri Altman + +[ Upstream commit 7869c6521f5715688b3d1f1c897374a68544eef0 ] + +Remove hba->clk_gating.active_reqs check from ufshcd_is_ufs_dev_busy() +function to separate clock gating logic from general device busy checks. 
+ +Signed-off-by: Avri Altman +Link: https://lore.kernel.org/r/20241124070808.194860-3-avri.altman@wdc.com +Reviewed-by: Bart Van Assche +Signed-off-by: Martin K. Petersen +Stable-dep-of: 839a74b5649c ("scsi: ufs: Fix toggling of clk_gating.state when clock gating is not allowed") +Signed-off-by: Sasha Levin +--- + drivers/ufs/core/ufshcd.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index f2cacdac1a4fe..36725a12edd1e 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -266,8 +266,7 @@ static bool ufshcd_has_pending_tasks(struct ufs_hba *hba) + + static bool ufshcd_is_ufs_dev_busy(struct ufs_hba *hba) + { +- return hba->clk_gating.active_reqs || hba->outstanding_reqs || +- ufshcd_has_pending_tasks(hba); ++ return hba->outstanding_reqs || ufshcd_has_pending_tasks(hba); + } + + static const struct ufs_dev_quirk ufs_fixups[] = { +@@ -1949,7 +1948,9 @@ static void ufshcd_gate_work(struct work_struct *work) + goto rel_lock; + } + +- if (ufshcd_is_ufs_dev_busy(hba) || hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) ++ if (ufshcd_is_ufs_dev_busy(hba) || ++ hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL || ++ hba->clk_gating.active_reqs) + goto rel_lock; + + spin_unlock_irqrestore(hba->host->host_lock, flags); +@@ -8264,7 +8265,9 @@ static void ufshcd_rtc_work(struct work_struct *work) + hba = container_of(to_delayed_work(work), struct ufs_hba, ufs_rtc_update_work); + + /* Update RTC only when there are no requests in progress and UFSHCI is operational */ +- if (!ufshcd_is_ufs_dev_busy(hba) && hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL) ++ if (!ufshcd_is_ufs_dev_busy(hba) && ++ hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL && ++ !hba->clk_gating.active_reqs) + ufshcd_update_rtc(hba); + + if (ufshcd_is_ufs_dev_active(hba) && hba->dev_info.rtc_update_period) +-- +2.39.5 + diff --git a/queue-6.13/scsi-ufs-fix-toggling-of-clk_gating.state-when-clock.patch b/queue-6.13/scsi-ufs-fix-toggling-of-clk_gating.state-when-clock.patch new file mode 100644 index 0000000000..15bdff7280 --- /dev/null +++ b/queue-6.13/scsi-ufs-fix-toggling-of-clk_gating.state-when-clock.patch @@ -0,0 +1,48 @@ +From 37cfb988311df359f28e68acaa750681ff16c0fd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Jan 2025 09:12:07 +0200 +Subject: scsi: ufs: Fix toggling of clk_gating.state when clock gating is not + allowed + +From: Avri Altman + +[ Upstream commit 839a74b5649c9f41d939a05059b5ca6b17156d03 ] + +This commit addresses an issue where clk_gating.state is being toggled in +ufshcd_setup_clocks() even if clock gating is not allowed. + +The fix is to add a check for hba->clk_gating.is_initialized before toggling +clk_gating.state in ufshcd_setup_clocks(). + +Since clk_gating.lock is now initialized unconditionally, it can no longer +lead to the spinlock being used before it is properly initialized, but +instead it is mostly for documentation purposes. + +Fixes: 1ab27c9cf8b6 ("ufs: Add support for clock gating") +Reported-by: Geert Uytterhoeven +Tested-by: Geert Uytterhoeven +Signed-off-by: Avri Altman +Link: https://lore.kernel.org/r/20250128071207.75494-3-avri.altman@wdc.com +Reviewed-by: Bart Van Assche +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/ufs/core/ufshcd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index 8a52397276b74..03c0ee10d8f48 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -9209,7 +9209,7 @@ static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on) + if (!IS_ERR_OR_NULL(clki->clk) && clki->enabled) + clk_disable_unprepare(clki->clk); + } +- } else if (!ret && on) { ++ } else if (!ret && on && hba->clk_gating.is_initialized) { + scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) + hba->clk_gating.state = CLKS_ON; + trace_ufshcd_clk_gating(dev_name(hba->dev), +-- +2.39.5 + diff --git a/queue-6.13/series b/queue-6.13/series index a19c121fc4..4b8d96c801 100644 --- a/queue-6.13/series +++ b/queue-6.13/series @@ -108,3 +108,63 @@ kbuild-suppress-stdout-from-merge_config-for-silent-.patch asoc-intel-bytcr_rt5640-add-dmi-quirk-for-vexia-edu-.patch asoc-renesas-snd_siu_migor-should-depend-on-dmadevic.patch kbuild-use-fzero-init-padding-bits-all.patch +include-net-add-static-inline-dst_dev_overhead-to-ds.patch +net-ipv6-ioam6_iptunnel-mitigate-2-realloc-issue.patch +net-ipv6-seg6_iptunnel-mitigate-2-realloc-issue.patch +net-ipv6-rpl_iptunnel-mitigate-2-realloc-issue.patch +net-ipv6-fix-dst-ref-loops-in-rpl-seg6-and-ioam6-lwt.patch +clocksource-use-pr_info-for-checking-clocksource-syn.patch +clocksource-use-migrate_disable-to-avoid-calling-get.patch +drm-xe-oa-uapi-make-oa-buffer-size-configurable.patch +drm-xe-oa-uapi-expose-an-unblock-after-n-reports-oa-.patch +drm-xe-oa-set-stream-pollin-in-xe_oa_buffer_check_un.patch +scsi-ufs-core-introduce-ufshcd_has_pending_tasks.patch +scsi-ufs-core-prepare-to-introduce-a-new-clock_gatin.patch +scsi-ufs-core-introduce-a-new-clock_gating-lock.patch +scsi-ufs-fix-toggling-of-clk_gating.state-when-clock.patch +samples-hid-remove-unnecessary-i-flags-from-libbpf-e.patch +samples-hid-fix-broken-vmlinux-path-for-vmlinux_btf.patch +rust-kbuild-add-fzero-init-padding-bits-to-bindgen_s.patch +cpufreq-amd-pstate-refactor-amd_pstate_epp_reenable-.patch +cpufreq-amd-pstate-remove-the-cppc_state-check-in-of.patch +cpufreq-amd-pstate-merge-amd_pstate_epp_cpu_offline-.patch +cpufreq-amd-pstate-convert-mutex-use-to-guard.patch +cpufreq-amd-pstate-fix-cpufreq_policy-ref-counting.patch +ipv4-add-rcu-protection-to-ip4_dst_hoplimit.patch +ipv4-use-rcu-protection-in-ip_dst_mtu_maybe_forward.patch +net-add-dev_net_rcu-helper.patch +ipv4-use-rcu-protection-in-ipv4_default_advmss.patch +ipv4-use-rcu-protection-in-rt_is_expired.patch +ipv4-use-rcu-protection-in-inet_select_addr.patch +ipv4-use-rcu-protection-in-__ip_rt_update_pmtu.patch +ipv4-icmp-convert-to-dev_net_rcu.patch +flow_dissector-use-rcu-protection-to-fetch-dev_net.patch +ipv6-use-rcu-protection-in-ip6_default_advmss.patch +ipv6-icmp-convert-to-dev_net_rcu.patch +compiler.h-move-c-string-helpers-into-c-only-kernel-.patch +genirq-remove-leading-space-from-irq_chip-irq_print_.patch +hid-hid-steam-make-sure-rumble-work-is-canceled-on-r.patch +hid-hid-steam-move-hidraw-input-un-registering-to-wo.patch +net-make-sure-we-retain-napi-ordering-on-netdev-napi.patch +eth-iavf-extend-the-netdev_lock-usage.patch +net-add-netdev_lock-netdev_unlock-helpers.patch +net-make-netdev_lock-protect-netdev-reg_state.patch +net-add-netdev-up-protected-by-netdev_lock.patch +net-protect-netdev-napi_list-with-netdev_lock.patch +revert-net-skb-introduce-and-use-a-single-page-frag-.patch 
+ndisc-use-rcu-protection-in-ndisc_alloc_skb.patch +neighbour-use-rcu-protection-in-__neigh_notify.patch +arp-use-rcu-protection-in-arp_xmit.patch +openvswitch-use-rcu-protection-in-ovs_vport_cmd_fill.patch +ndisc-extend-rcu-protection-in-ndisc_send_skb.patch +ipv6-mcast-extend-rcu-protection-in-igmp6_send.patch +btrfs-rename-__get_extent_map-and-pass-btrfs_inode.patch +btrfs-fix-stale-page-cache-after-race-between-readah.patch +iavf-fix-a-locking-bug-in-an-error-path.patch +io_uring-uring_cmd-cleanup-struct-io_uring_cmd_data-.patch +io_uring-uring_cmd-don-t-assume-io_uring_cmd_data-la.patch +io_uring-uring_cmd-switch-sqe-to-async_data-on-eagai.patch +ipv6-mcast-add-rcu-protection-to-mld_newpack.patch +s390-qeth-move-netif_napi_add_tx-and-napi_enable-fro.patch +reapply-net-skb-introduce-and-use-a-single-page-frag.patch +io_uring-uring_cmd-unconditionally-copy-sqes-at-prep.patch -- 2.47.2
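
A note on the locking pattern used throughout the three clk_gating patches
above: the conversions from explicit spin_lock_irqsave()/spin_unlock_irqrestore()
pairs to guard() and scoped_guard() rely on the scope-based cleanup helpers
from include/linux/cleanup.h. A minimal sketch of the pattern follows; the
my_dev structure and both functions are hypothetical, invented purely for
illustration, while guard() and scoped_guard() are the real kernel helpers
the patches use.

	#include <linux/cleanup.h>
	#include <linux/spinlock.h>

	/* Hypothetical device, standing in for struct ufs_clk_gating. */
	struct my_dev {
		spinlock_t lock;	/* plays the role of clk_gating.lock */
		int state;
		int active_reqs;
	};

	static void my_gate_work(struct my_dev *dev)
	{
		/*
		 * scoped_guard() takes the lock for the duration of the
		 * attached block only; any exit from the block, including
		 * an early return, drops the lock automatically.
		 */
		scoped_guard(spinlock_irqsave, &dev->lock) {
			if (dev->active_reqs)
				return;	/* lock released on the way out */
			dev->state = 0;
		}
		/* Lock already dropped here; slow work can run unlocked. */
	}

	static void my_release(struct my_dev *dev)
	{
		/* guard() holds the lock until the end of the function. */
		guard(spinlock_irqsave)(&dev->lock);
		dev->active_reqs--;
	}

This is why the rel_lock/out cleanup labels in ufshcd_gate_work() could be
deleted: every early return inside a scoped_guard() block unlocks by itself,
so no goto-based unlock path is needed.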