From 9ea2e3d08d4863562f4d3a236a4ef4b6de7a03aa Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 13 Jul 2024 09:26:38 -0400 Subject: [PATCH] Fixes for 6.1 Signed-off-by: Sasha Levin --- ...-of-args-in-call-to-bpf_map_kvcalloc.patch | 61 ++ queue-6.1/bpf-reduce-smap-elem_size.patch | 44 ++ ...e-inode-task-sk-storage-functions-fo.patch | 523 ++++++++++++++++++ ...remove-__bpf_local_storage_map_alloc.patch | 110 ++++ ...pf_map_kvcalloc-in-bpf_local_storage.patch | 98 ++++ ...missing-lock-protection-when-polling.patch | 56 ++ ...l-all-requests-for-the-object-that-i.patch | 67 +++ ...c-allocation-of-msg_id-to-avoid-reus.patch | 123 ++++ ...w-the-scope-of-triggering-epollin-ev.patch | 90 +++ ...gate-errors-from-vfs_getxattr-to-avo.patch | 70 +++ ...sending-new-request-when-dropping-ob.patch | 92 +++ ...for-ondemand_object_worker-to-finish.patch | 78 +++ ...do-not-return-sqi-value-if-link-is-d.patch | 122 ++++ ...ential-use-after-free-in-posix_lock_.patch | 50 ++ ...-value-stored-to-dentry-d_flags-inst.patch | 44 ++ ...gram-unloading-while-removing-the-dr.patch | 119 ++++ ...encing-null-ptr-in-pfn_section_valid.patch | 44 ++ ...antiq_etop-fix-double-free-in-detach.patch | 43 ++ ...-star-emac-set-mac_managed_pm-when-p.patch | 55 ++ .../net-fix-rc7-s-__skb_datagram_iter.patch | 45 ++ ...p-lan87xx-reinit-phy-after-cable-tes.patch | 41 ++ ...sched-fix-uaf-when-resolving-a-clash.patch | 131 +++++ ...-eperm-in-case-of-connection-failure.patch | 64 +++ ...-incorrect-value-output-on-error-pat.patch | 44 ++ ...ed-as-lcp-but-actually-malformed-pac.patch | 67 +++ queue-6.1/series | 29 + ...ip-zero-length-skb-in-sk_msg_recvmsg.patch | 105 ++++ ...t-undo-caused-by-dsack-of-tlp-retran.patch | 107 ++++ ...rcu_free-earlier-in-udp_lib_get_port.patch | 123 ++++ ...gative-dentry-count-when-on-shrinker.patch | 88 +++ 30 files changed, 2733 insertions(+) create mode 100644 queue-6.1/bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch create mode 100644 queue-6.1/bpf-reduce-smap-elem_size.patch create mode 100644 queue-6.1/bpf-refactor-some-inode-task-sk-storage-functions-fo.patch create mode 100644 queue-6.1/bpf-remove-__bpf_local_storage_map_alloc.patch create mode 100644 queue-6.1/bpf-use-bpf_map_kvcalloc-in-bpf_local_storage.patch create mode 100644 queue-6.1/cachefiles-add-missing-lock-protection-when-polling.patch create mode 100644 queue-6.1/cachefiles-cancel-all-requests-for-the-object-that-i.patch create mode 100644 queue-6.1/cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch create mode 100644 queue-6.1/cachefiles-narrow-the-scope-of-triggering-epollin-ev.patch create mode 100644 queue-6.1/cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch create mode 100644 queue-6.1/cachefiles-stop-sending-new-request-when-dropping-ob.patch create mode 100644 queue-6.1/cachefiles-wait-for-ondemand_object_worker-to-finish.patch create mode 100644 queue-6.1/ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch create mode 100644 queue-6.1/filelock-fix-potential-use-after-free-in-posix_lock_.patch create mode 100644 queue-6.1/fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch create mode 100644 queue-6.1/i40e-fix-xdp-program-unloading-while-removing-the-dr.patch create mode 100644 queue-6.1/mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch create mode 100644 queue-6.1/net-ethernet-lantiq_etop-fix-double-free-in-detach.patch create mode 100644 queue-6.1/net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch create mode 100644 queue-6.1/net-fix-rc7-s-__skb_datagram_iter.patch 
create mode 100644 queue-6.1/net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch create mode 100644 queue-6.1/net-sched-fix-uaf-when-resolving-a-clash.patch create mode 100644 queue-6.1/net-sunrpc-remap-eperm-in-case-of-connection-failure.patch create mode 100644 queue-6.1/octeontx2-af-fix-incorrect-value-output-on-error-pat.patch create mode 100644 queue-6.1/ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch create mode 100644 queue-6.1/series create mode 100644 queue-6.1/skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch create mode 100644 queue-6.1/tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch create mode 100644 queue-6.1/udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch create mode 100644 queue-6.1/vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch diff --git a/queue-6.1/bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch b/queue-6.1/bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch new file mode 100644 index 00000000000..87ae30bbec3 --- /dev/null +++ b/queue-6.1/bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch @@ -0,0 +1,61 @@ +From 690a7c085596a46117e8a65d8e038c4afb3f1112 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 10 Jul 2024 12:05:22 +0200 +Subject: bpf: fix order of args in call to bpf_map_kvcalloc + +From: Mohammad Shehar Yaar Tausif + +[ Upstream commit af253aef183a31ce62d2e39fc520b0ebfb562bb9 ] + +The original function call passed size of smap->bucket before the number of +buckets which raises the error 'calloc-transposed-args' on compilation. + +Vlastimil Babka added: + +The order of parameters can be traced back all the way to 6ac99e8f23d4 +("bpf: Introduce bpf sk local storage") accross several refactorings, +and that's why the commit is used as a Fixes: tag. + +In v6.10-rc1, a different commit 2c321f3f70bc ("mm: change inlined +allocation helpers to account at the call site") however exposed the +order of args in a way that gcc-14 has enough visibility to start +warning about it, because (in !CONFIG_MEMCG case) bpf_map_kvcalloc is +then a macro alias for kvcalloc instead of a static inline wrapper. 
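A minimal userspace sketch of the pattern gcc-14 flags (hypothetical struct; plain calloc() standing in for bpf_map_kvcalloc(); -Wcalloc-transposed-args is enabled by -Wextra in gcc-14):

	#include <stdlib.h>

	struct bucket { void *head; };

	int main(void)
	{
		unsigned int nbuckets = 16;

		/* transposed: sizeof() in the count slot, count in the size slot
		 * -> gcc-14 warns 'calloc' sizes specified with 'sizeof' */
		struct bucket *bad = calloc(sizeof(*bad), nbuckets);

		/* fixed order: number of elements first, element size second */
		struct bucket *good = calloc(nbuckets, sizeof(*good));

		free(bad);
		free(good);
		return 0;
	}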
+ +To sum up the warning happens when the following conditions are all met: + +- gcc-14 is used (didn't see it with gcc-13) +- commit 2c321f3f70bc is present +- CONFIG_MEMCG is not enabled in .config +- CONFIG_WERROR turns this from a compiler warning to error + +Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") +Reviewed-by: Andrii Nakryiko +Tested-by: Christian Kujau +Signed-off-by: Mohammad Shehar Yaar Tausif +Signed-off-by: Vlastimil Babka +Link: https://lore.kernel.org/r/20240710100521.15061-2-vbabka@suse.cz +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + kernel/bpf/bpf_local_storage.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c +index 888b8e481083f..51a9f024c1829 100644 +--- a/kernel/bpf/bpf_local_storage.c ++++ b/kernel/bpf/bpf_local_storage.c +@@ -620,8 +620,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); + +- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), +- nbuckets, GFP_USER | __GFP_NOWARN); ++ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, ++ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_area_free(smap); + return ERR_PTR(-ENOMEM); +-- +2.43.0 + diff --git a/queue-6.1/bpf-reduce-smap-elem_size.patch b/queue-6.1/bpf-reduce-smap-elem_size.patch new file mode 100644 index 00000000000..139fe7d03b4 --- /dev/null +++ b/queue-6.1/bpf-reduce-smap-elem_size.patch @@ -0,0 +1,44 @@ +From dd95c48b0dd43cc09f096180ebd06fb702320d0f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 20 Dec 2022 17:30:36 -0800 +Subject: bpf: Reduce smap->elem_size + +From: Martin KaFai Lau + +[ Upstream commit 552d42a356ebf78df9d2f4b73e077d2459966fac ] + +'struct bpf_local_storage_elem' has an unused 56 byte padding at the +end due to struct's cache-line alignment requirement. This padding +space is overlapped by storage value contents, so if we use sizeof() +to calculate the total size, we overinflate it by 56 bytes. Use +offsetof() instead to calculate more exact memory use. 
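The saving can be seen with a userspace sketch (a hypothetical struct stands in for bpf_local_storage_elem; the GCC aligned attribute models the cache-line alignment requirement):

	#include <stdio.h>
	#include <stddef.h>

	struct elem {
		long meta;			/* bookkeeping before the value */
		char data[];			/* value bytes start here */
	} __attribute__((aligned(64)));		/* alignment pads sizeof to 64 */

	enum { VALUE_SIZE = 8 };

	int main(void)
	{
		/* sizeof() includes the trailing alignment padding */
		printf("sizeof-based:   %zu\n", sizeof(struct elem) + VALUE_SIZE);
		/* offsetof() stops where the value actually ends */
		printf("offsetof-based: %zu\n",
		       offsetof(struct elem, data[VALUE_SIZE]));
		return 0;
	}

Here the sizeof-based total is 72 bytes while the offsetof-based one is 16, illustrating the overinflation the patch removes.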
+ +Signed-off-by: Martin KaFai Lau +Signed-off-by: Daniel Borkmann +Acked-by: Yonghong Song +Acked-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20221221013036.3427431-1-martin.lau@linux.dev +Stable-dep-of: af253aef183a ("bpf: fix order of args in call to bpf_map_kvcalloc") +Signed-off-by: Sasha Levin +--- + kernel/bpf/bpf_local_storage.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c +index b1090a2b02b34..f8dd7c516e320 100644 +--- a/kernel/bpf/bpf_local_storage.c ++++ b/kernel/bpf/bpf_local_storage.c +@@ -580,8 +580,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att + raw_spin_lock_init(&smap->buckets[i].lock); + } + +- smap->elem_size = +- sizeof(struct bpf_local_storage_elem) + attr->value_size; ++ smap->elem_size = offsetof(struct bpf_local_storage_elem, ++ sdata.data[attr->value_size]); + + return smap; + } +-- +2.43.0 + diff --git a/queue-6.1/bpf-refactor-some-inode-task-sk-storage-functions-fo.patch b/queue-6.1/bpf-refactor-some-inode-task-sk-storage-functions-fo.patch new file mode 100644 index 00000000000..9896a58c4e0 --- /dev/null +++ b/queue-6.1/bpf-refactor-some-inode-task-sk-storage-functions-fo.patch @@ -0,0 +1,523 @@ +From b243ffd1225efb6fbb28431fe18afb6657e01725 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 25 Oct 2022 21:28:45 -0700 +Subject: bpf: Refactor some inode/task/sk storage functions for reuse + +From: Yonghong Song + +[ Upstream commit c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db ] + +Refactor codes so that inode/task/sk storage implementation +can maximally share the same code. I also added some comments +in new function bpf_local_storage_unlink_nolock() to make +codes easy to understand. There is no functionality change. 
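The shape of the refactor, modelled in plain C (simplified, hypothetical types; the real code additionally unlinks each element from its map and deals with locking and RCU):

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct elem { struct elem *next; };
	struct local_storage { struct elem *list; };

	/* One shared helper replaces the three near-identical loops in the
	 * inode/task/sk free paths; returns true once the list drains. */
	static bool storage_unlink_nolock(struct local_storage *ls)
	{
		bool free_storage = false;

		while (ls->list) {
			struct elem *e = ls->list;

			ls->list = e->next;		/* unlink from storage list */
			free(e);
			free_storage = !ls->list;	/* true on the last element */
		}
		return free_storage;
	}

	int main(void)
	{
		struct local_storage ls = {
			.list = calloc(1, sizeof(struct elem)),
		};

		printf("free storage: %s\n",
		       storage_unlink_nolock(&ls) ? "yes" : "no");
		return 0;
	}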
+ +Acked-by: David Vernet +Signed-off-by: Yonghong Song +Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com +Signed-off-by: Alexei Starovoitov +Stable-dep-of: af253aef183a ("bpf: fix order of args in call to bpf_map_kvcalloc") +Signed-off-by: Sasha Levin +--- + include/linux/bpf_local_storage.h | 17 ++- + kernel/bpf/bpf_inode_storage.c | 38 +----- + kernel/bpf/bpf_local_storage.c | 190 +++++++++++++++++++----------- + kernel/bpf/bpf_task_storage.c | 38 +----- + net/core/bpf_sk_storage.c | 35 +----- + 5 files changed, 137 insertions(+), 181 deletions(-) + +diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h +index 7ea18d4da84b8..6d37a40cd90e8 100644 +--- a/include/linux/bpf_local_storage.h ++++ b/include/linux/bpf_local_storage.h +@@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ + } + +-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, +- u16 idx); +- + /* Helper functions for bpf_local_storage */ + int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); ++struct bpf_map * ++bpf_local_storage_map_alloc(union bpf_attr *attr, ++ struct bpf_local_storage_cache *cache); + + struct bpf_local_storage_data * + bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, ++bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); ++ ++void bpf_local_storage_map_free(struct bpf_map *map, ++ struct bpf_local_storage_cache *cache, + int __percpu *busy_counter); + + int bpf_local_storage_map_check_btf(const struct bpf_map *map, +@@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, + void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, +- struct bpf_local_storage_elem *selem, +- bool uncharge_omem, bool use_trace_rcu); +- + void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); + + void bpf_selem_link_map(struct bpf_local_storage_map *smap, +diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c +index 5f7683b191998..6a1d4d22816a3 100644 +--- a/kernel/bpf/bpf_inode_storage.c ++++ b/kernel/bpf/bpf_inode_storage.c +@@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + + void bpf_inode_storage_free(struct inode *inode) + { +- struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; +- struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) +@@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode) + return; + } + +- /* Neither the bpf_prog nor the bpf-map's syscall +- * could be modifying the local_storage->list now. +- * Thus, no elem can be added-to or deleted-from the +- * local_storage->list by the bpf_prog or by the bpf-map's syscall. +- * +- * It is racing with bpf_local_storage_map_free() alone +- * when unlinking elem from the local_storage->list and +- * the map's bucket->list. 
+- */ + raw_spin_lock_bh(&local_storage->lock); +- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { +- /* Always unlink from map before unlinking from +- * local_storage. +- */ +- bpf_selem_unlink_map(selem); +- free_inode_storage = bpf_selem_unlink_storage_nolock( +- local_storage, selem, false, false); +- } ++ free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + +- /* free_inoode_storage should always be true as long as +- * local_storage->list was non-empty. +- */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); + } +@@ -226,23 +205,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, + + static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) + { +- struct bpf_local_storage_map *smap; +- +- smap = bpf_local_storage_map_alloc(attr); +- if (IS_ERR(smap)) +- return ERR_CAST(smap); +- +- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); +- return &smap->map; ++ return bpf_local_storage_map_alloc(attr, &inode_cache); + } + + static void inode_storage_map_free(struct bpf_map *map) + { +- struct bpf_local_storage_map *smap; +- +- smap = (struct bpf_local_storage_map *)map; +- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); +- bpf_local_storage_map_free(smap, NULL); ++ bpf_local_storage_map_free(map, &inode_cache, NULL); + } + + BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, +diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c +index d9d88a2cda5e5..b1090a2b02b34 100644 +--- a/kernel/bpf/bpf_local_storage.c ++++ b/kernel/bpf/bpf_local_storage.c +@@ -114,9 +114,9 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, +- struct bpf_local_storage_elem *selem, +- bool uncharge_mem, bool use_trace_rcu) ++static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, ++ struct bpf_local_storage_elem *selem, ++ bool uncharge_mem, bool use_trace_rcu) + { + struct bpf_local_storage_map *smap; + bool free_local_storage; +@@ -501,7 +501,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + return ERR_PTR(err); + } + +-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) ++static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) + { + u64 min_usage = U64_MAX; + u16 i, res = 0; +@@ -525,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) + return res; + } + +-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, +- u16 idx) ++static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, ++ u16 idx) + { + spin_lock(&cache->idx_lock); + cache->idx_usage_counts[idx]--; + spin_unlock(&cache->idx_lock); + } + +-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, +- int __percpu *busy_counter) +-{ +- struct bpf_local_storage_elem *selem; +- struct bpf_local_storage_map_bucket *b; +- unsigned int i; +- +- /* Note that this map might be concurrently cloned from +- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone +- * RCU read section to finish before proceeding. New RCU +- * read sections should be prevented via bpf_map_inc_not_zero. 
+- */ +- synchronize_rcu(); +- +- /* bpf prog and the userspace can no longer access this map +- * now. No new selem (of this map) can be added +- * to the owner->storage or to the map bucket's list. +- * +- * The elem of this map can be cleaned up here +- * or when the storage is freed e.g. +- * by bpf_sk_storage_free() during __sk_destruct(). +- */ +- for (i = 0; i < (1U << smap->bucket_log); i++) { +- b = &smap->buckets[i]; +- +- rcu_read_lock(); +- /* No one is adding to b->list now */ +- while ((selem = hlist_entry_safe( +- rcu_dereference_raw(hlist_first_rcu(&b->list)), +- struct bpf_local_storage_elem, map_node))) { +- if (busy_counter) { +- migrate_disable(); +- this_cpu_inc(*busy_counter); +- } +- bpf_selem_unlink(selem, false); +- if (busy_counter) { +- this_cpu_dec(*busy_counter); +- migrate_enable(); +- } +- cond_resched_rcu(); +- } +- rcu_read_unlock(); +- } +- +- /* While freeing the storage we may still need to access the map. +- * +- * e.g. when bpf_sk_storage_free() has unlinked selem from the map +- * which then made the above while((selem = ...)) loop +- * exit immediately. +- * +- * However, while freeing the storage one still needs to access the +- * smap->elem_size to do the uncharging in +- * bpf_selem_unlink_storage_nolock(). +- * +- * Hence, wait another rcu grace period for the storage to be freed. +- */ +- synchronize_rcu(); +- +- kvfree(smap->buckets); +- bpf_map_area_free(smap); +-} +- + int bpf_local_storage_map_alloc_check(union bpf_attr *attr) + { + if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || +@@ -614,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) + return 0; + } + +-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) ++static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) + { + struct bpf_local_storage_map *smap; + unsigned int i; +@@ -664,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, + + return 0; + } ++ ++bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) ++{ ++ struct bpf_local_storage_elem *selem; ++ bool free_storage = false; ++ struct hlist_node *n; ++ ++ /* Neither the bpf_prog nor the bpf_map's syscall ++ * could be modifying the local_storage->list now. ++ * Thus, no elem can be added to or deleted from the ++ * local_storage->list by the bpf_prog or by the bpf_map's syscall. ++ * ++ * It is racing with bpf_local_storage_map_free() alone ++ * when unlinking elem from the local_storage->list and ++ * the map's bucket->list. ++ */ ++ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { ++ /* Always unlink from map before unlinking from ++ * local_storage. ++ */ ++ bpf_selem_unlink_map(selem); ++ /* If local_storage list has only one element, the ++ * bpf_selem_unlink_storage_nolock() will return true. ++ * Otherwise, it will return false. The current loop iteration ++ * intends to remove all local storage. So the last iteration ++ * of the loop will set the free_cgroup_storage to true. 
++ */ ++ free_storage = bpf_selem_unlink_storage_nolock( ++ local_storage, selem, false, false); ++ } ++ ++ return free_storage; ++} ++ ++struct bpf_map * ++bpf_local_storage_map_alloc(union bpf_attr *attr, ++ struct bpf_local_storage_cache *cache) ++{ ++ struct bpf_local_storage_map *smap; ++ ++ smap = __bpf_local_storage_map_alloc(attr); ++ if (IS_ERR(smap)) ++ return ERR_CAST(smap); ++ ++ smap->cache_idx = bpf_local_storage_cache_idx_get(cache); ++ return &smap->map; ++} ++ ++void bpf_local_storage_map_free(struct bpf_map *map, ++ struct bpf_local_storage_cache *cache, ++ int __percpu *busy_counter) ++{ ++ struct bpf_local_storage_map_bucket *b; ++ struct bpf_local_storage_elem *selem; ++ struct bpf_local_storage_map *smap; ++ unsigned int i; ++ ++ smap = (struct bpf_local_storage_map *)map; ++ bpf_local_storage_cache_idx_free(cache, smap->cache_idx); ++ ++ /* Note that this map might be concurrently cloned from ++ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone ++ * RCU read section to finish before proceeding. New RCU ++ * read sections should be prevented via bpf_map_inc_not_zero. ++ */ ++ synchronize_rcu(); ++ ++ /* bpf prog and the userspace can no longer access this map ++ * now. No new selem (of this map) can be added ++ * to the owner->storage or to the map bucket's list. ++ * ++ * The elem of this map can be cleaned up here ++ * or when the storage is freed e.g. ++ * by bpf_sk_storage_free() during __sk_destruct(). ++ */ ++ for (i = 0; i < (1U << smap->bucket_log); i++) { ++ b = &smap->buckets[i]; ++ ++ rcu_read_lock(); ++ /* No one is adding to b->list now */ ++ while ((selem = hlist_entry_safe( ++ rcu_dereference_raw(hlist_first_rcu(&b->list)), ++ struct bpf_local_storage_elem, map_node))) { ++ if (busy_counter) { ++ migrate_disable(); ++ this_cpu_inc(*busy_counter); ++ } ++ bpf_selem_unlink(selem, false); ++ if (busy_counter) { ++ this_cpu_dec(*busy_counter); ++ migrate_enable(); ++ } ++ cond_resched_rcu(); ++ } ++ rcu_read_unlock(); ++ } ++ ++ /* While freeing the storage we may still need to access the map. ++ * ++ * e.g. when bpf_sk_storage_free() has unlinked selem from the map ++ * which then made the above while((selem = ...)) loop ++ * exit immediately. ++ * ++ * However, while freeing the storage one still needs to access the ++ * smap->elem_size to do the uncharging in ++ * bpf_selem_unlink_storage_nolock(). ++ * ++ * Hence, wait another rcu grace period for the storage to be freed. ++ */ ++ synchronize_rcu(); ++ ++ kvfree(smap->buckets); ++ bpf_map_area_free(smap); ++} +diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c +index 6f290623347e0..40a92edd6f539 100644 +--- a/kernel/bpf/bpf_task_storage.c ++++ b/kernel/bpf/bpf_task_storage.c +@@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, + + void bpf_task_storage_free(struct task_struct *task) + { +- struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; +- struct hlist_node *n; + unsigned long flags; + + rcu_read_lock(); +@@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task) + return; + } + +- /* Neither the bpf_prog nor the bpf-map's syscall +- * could be modifying the local_storage->list now. +- * Thus, no elem can be added-to or deleted-from the +- * local_storage->list by the bpf_prog or by the bpf-map's syscall. 
+- * +- * It is racing with bpf_local_storage_map_free() alone +- * when unlinking elem from the local_storage->list and +- * the map's bucket->list. +- */ + bpf_task_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); +- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { +- /* Always unlink from map before unlinking from +- * local_storage. +- */ +- bpf_selem_unlink_map(selem); +- free_task_storage = bpf_selem_unlink_storage_nolock( +- local_storage, selem, false, false); +- } ++ free_task_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); + rcu_read_unlock(); + +- /* free_task_storage should always be true as long as +- * local_storage->list was non-empty. +- */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); + } +@@ -288,23 +267,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) + + static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) + { +- struct bpf_local_storage_map *smap; +- +- smap = bpf_local_storage_map_alloc(attr); +- if (IS_ERR(smap)) +- return ERR_CAST(smap); +- +- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); +- return &smap->map; ++ return bpf_local_storage_map_alloc(attr, &task_cache); + } + + static void task_storage_map_free(struct bpf_map *map) + { +- struct bpf_local_storage_map *smap; +- +- smap = (struct bpf_local_storage_map *)map; +- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); +- bpf_local_storage_map_free(smap, &bpf_task_storage_busy); ++ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); + } + + BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c +index ad01b1bea52e4..0124536e8a9db 100644 +--- a/net/core/bpf_sk_storage.c ++++ b/net/core/bpf_sk_storage.c +@@ -48,10 +48,8 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) + /* Called by __sk_destruct() & bpf_sk_storage_clone() */ + void bpf_sk_storage_free(struct sock *sk) + { +- struct bpf_local_storage_elem *selem; + struct bpf_local_storage *sk_storage; + bool free_sk_storage = false; +- struct hlist_node *n; + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); +@@ -60,24 +58,8 @@ void bpf_sk_storage_free(struct sock *sk) + return; + } + +- /* Netiher the bpf_prog nor the bpf-map's syscall +- * could be modifying the sk_storage->list now. +- * Thus, no elem can be added-to or deleted-from the +- * sk_storage->list by the bpf_prog or by the bpf-map's syscall. +- * +- * It is racing with bpf_local_storage_map_free() alone +- * when unlinking elem from the sk_storage->list and +- * the map's bucket->list. +- */ + raw_spin_lock_bh(&sk_storage->lock); +- hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { +- /* Always unlink from map before unlinking from +- * sk_storage. 
+- */ +- bpf_selem_unlink_map(selem); +- free_sk_storage = bpf_selem_unlink_storage_nolock( +- sk_storage, selem, true, false); +- } ++ free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage); + raw_spin_unlock_bh(&sk_storage->lock); + rcu_read_unlock(); + +@@ -87,23 +69,12 @@ void bpf_sk_storage_free(struct sock *sk) + + static void bpf_sk_storage_map_free(struct bpf_map *map) + { +- struct bpf_local_storage_map *smap; +- +- smap = (struct bpf_local_storage_map *)map; +- bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); +- bpf_local_storage_map_free(smap, NULL); ++ bpf_local_storage_map_free(map, &sk_cache, NULL); + } + + static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) + { +- struct bpf_local_storage_map *smap; +- +- smap = bpf_local_storage_map_alloc(attr); +- if (IS_ERR(smap)) +- return ERR_CAST(smap); +- +- smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); +- return &smap->map; ++ return bpf_local_storage_map_alloc(attr, &sk_cache); + } + + static int notsupp_get_next_key(struct bpf_map *map, void *key, +-- +2.43.0 + diff --git a/queue-6.1/bpf-remove-__bpf_local_storage_map_alloc.patch b/queue-6.1/bpf-remove-__bpf_local_storage_map_alloc.patch new file mode 100644 index 00000000000..41f361b7314 --- /dev/null +++ b/queue-6.1/bpf-remove-__bpf_local_storage_map_alloc.patch @@ -0,0 +1,110 @@ +From 24b661a9da3e3ae93686a9005c609a55b0c3f142 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 7 Mar 2023 22:59:22 -0800 +Subject: bpf: Remove __bpf_local_storage_map_alloc + +From: Martin KaFai Lau + +[ Upstream commit 62827d612ae525695799b3635a087cb49c55e977 ] + +bpf_local_storage_map_alloc() is the only caller of +__bpf_local_storage_map_alloc(). The remaining logic in +bpf_local_storage_map_alloc() is only a one liner setting +the smap->cache_idx. + +Remove __bpf_local_storage_map_alloc() to simplify code. 
+ +Signed-off-by: Martin KaFai Lau +Link: https://lore.kernel.org/r/20230308065936.1550103-4-martin.lau@linux.dev +Signed-off-by: Alexei Starovoitov +Stable-dep-of: af253aef183a ("bpf: fix order of args in call to bpf_map_kvcalloc") +Signed-off-by: Sasha Levin +--- + kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++-------------------- + 1 file changed, 26 insertions(+), 37 deletions(-) + +diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c +index 8ea65973739e4..888b8e481083f 100644 +--- a/kernel/bpf/bpf_local_storage.c ++++ b/kernel/bpf/bpf_local_storage.c +@@ -552,40 +552,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) + return 0; + } + +-static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) +-{ +- struct bpf_local_storage_map *smap; +- unsigned int i; +- u32 nbuckets; +- +- smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); +- if (!smap) +- return ERR_PTR(-ENOMEM); +- bpf_map_init_from_attr(&smap->map, attr); +- +- nbuckets = roundup_pow_of_two(num_possible_cpus()); +- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ +- nbuckets = max_t(u32, 2, nbuckets); +- smap->bucket_log = ilog2(nbuckets); +- +- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), +- nbuckets, GFP_USER | __GFP_NOWARN); +- if (!smap->buckets) { +- bpf_map_area_free(smap); +- return ERR_PTR(-ENOMEM); +- } +- +- for (i = 0; i < nbuckets; i++) { +- INIT_HLIST_HEAD(&smap->buckets[i].list); +- raw_spin_lock_init(&smap->buckets[i].lock); +- } +- +- smap->elem_size = offsetof(struct bpf_local_storage_elem, +- sdata.data[attr->value_size]); +- +- return smap; +-} +- + int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, +@@ -641,10 +607,33 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache) + { + struct bpf_local_storage_map *smap; ++ unsigned int i; ++ u32 nbuckets; ++ ++ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); ++ if (!smap) ++ return ERR_PTR(-ENOMEM); ++ bpf_map_init_from_attr(&smap->map, attr); ++ ++ nbuckets = roundup_pow_of_two(num_possible_cpus()); ++ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ ++ nbuckets = max_t(u32, 2, nbuckets); ++ smap->bucket_log = ilog2(nbuckets); + +- smap = __bpf_local_storage_map_alloc(attr); +- if (IS_ERR(smap)) +- return ERR_CAST(smap); ++ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), ++ nbuckets, GFP_USER | __GFP_NOWARN); ++ if (!smap->buckets) { ++ bpf_map_area_free(smap); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ for (i = 0; i < nbuckets; i++) { ++ INIT_HLIST_HEAD(&smap->buckets[i].list); ++ raw_spin_lock_init(&smap->buckets[i].lock); ++ } ++ ++ smap->elem_size = offsetof(struct bpf_local_storage_elem, ++ sdata.data[attr->value_size]); + + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); + return &smap->map; +-- +2.43.0 + diff --git a/queue-6.1/bpf-use-bpf_map_kvcalloc-in-bpf_local_storage.patch b/queue-6.1/bpf-use-bpf_map_kvcalloc-in-bpf_local_storage.patch new file mode 100644 index 00000000000..f3b715c0f43 --- /dev/null +++ b/queue-6.1/bpf-use-bpf_map_kvcalloc-in-bpf_local_storage.patch @@ -0,0 +1,98 @@ +From daf3c08fc32bf497e5f7752034b59838dcbe0cad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Feb 2023 15:47:32 +0000 +Subject: bpf: use bpf_map_kvcalloc in bpf_local_storage + +From: Yafang Shao + +[ Upstream commit 
ddef81b5fd1da4d7c3cc8785d2043b73b72f38ef ] + +Introduce new helper bpf_map_kvcalloc() for the memory allocation in +bpf_local_storage(). Then the allocation will charge the memory from the +map instead of from current, though currently they are the same thing as +it is only used in map creation path now. By charging map's memory into +the memcg from the map, it will be more clear. + +Signed-off-by: Yafang Shao +Acked-by: Johannes Weiner +Acked-by: Roman Gushchin +Link: https://lore.kernel.org/r/20230210154734.4416-3-laoar.shao@gmail.com +Signed-off-by: Alexei Starovoitov +Stable-dep-of: af253aef183a ("bpf: fix order of args in call to bpf_map_kvcalloc") +Signed-off-by: Sasha Levin +--- + include/linux/bpf.h | 8 ++++++++ + kernel/bpf/bpf_local_storage.c | 4 ++-- + kernel/bpf/syscall.c | 15 +++++++++++++++ + 3 files changed, 25 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index 1ca1902af23e9..6b18b8da025f9 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -1777,6 +1777,8 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node); + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); ++void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, ++ gfp_t flags); + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags); + #else +@@ -1793,6 +1795,12 @@ bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) + return kzalloc(size, flags); + } + ++static inline void * ++bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) ++{ ++ return kvcalloc(n, size, flags); ++} ++ + static inline void __percpu * + bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, + gfp_t flags) +diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c +index f8dd7c516e320..8ea65973739e4 100644 +--- a/kernel/bpf/bpf_local_storage.c ++++ b/kernel/bpf/bpf_local_storage.c +@@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); + +- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, +- GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); ++ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), ++ nbuckets, GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_area_free(smap); + return ERR_PTR(-ENOMEM); +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index 1e46a84694b8a..d77597daa0022 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -470,6 +470,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) + return ptr; + } + ++void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, ++ gfp_t flags) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags) + { +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-add-missing-lock-protection-when-polling.patch b/queue-6.1/cachefiles-add-missing-lock-protection-when-polling.patch new file mode 100644 index 00000000000..5fe6b34854b --- /dev/null +++ 
b/queue-6.1/cachefiles-add-missing-lock-protection-when-polling.patch @@ -0,0 +1,56 @@ +From 7b278c2f2b77afa61fb5e3a7fbe2af01165f3e55 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:30 +0800 +Subject: cachefiles: add missing lock protection when polling + +From: Jingbo Xu + +[ Upstream commit cf5bb09e742a9cf6349127e868329a8f69b7a014 ] + +Add missing lock protection in poll routine when iterating xarray, +otherwise: + +Even with RCU read lock held, only the slot of the radix tree is +ensured to be pinned there, while the data structure (e.g. struct +cachefiles_req) stored in the slot has no such guarantee. The poll +routine will iterate the radix tree and dereference cachefiles_req +accordingly. Thus RCU read lock is not adequate in this case and +spinlock is needed here. + +Fixes: b817e22b2e91 ("cachefiles: narrow the scope of triggering EPOLLIN events in ondemand mode") +Signed-off-by: Jingbo Xu +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-10-libaokun@huaweicloud.com +Acked-by: Jeff Layton +Reviewed-by: Jia Zhu +Reviewed-by: Gao Xiang +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cachefiles/daemon.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c +index 06cdf1a8a16f6..89b11336a8369 100644 +--- a/fs/cachefiles/daemon.c ++++ b/fs/cachefiles/daemon.c +@@ -366,14 +366,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file, + + if (cachefiles_in_ondemand_mode(cache)) { + if (!xa_empty(&cache->reqs)) { +- rcu_read_lock(); ++ xas_lock(&xas); + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { + if (!cachefiles_ondemand_is_reopening_read(req)) { + mask |= EPOLLIN; + break; + } + } +- rcu_read_unlock(); ++ xas_unlock(&xas); + } + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-cancel-all-requests-for-the-object-that-i.patch b/queue-6.1/cachefiles-cancel-all-requests-for-the-object-that-i.patch new file mode 100644 index 00000000000..e50dc8bf812 --- /dev/null +++ b/queue-6.1/cachefiles-cancel-all-requests-for-the-object-that-i.patch @@ -0,0 +1,67 @@ +From 3711211fd0d34743a5a3afc4a784be53de0ec0d4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:27 +0800 +Subject: cachefiles: cancel all requests for the object that is being dropped + +From: Baokun Li + +[ Upstream commit 751f524635a4f076117d714705eeddadaf6748ee ] + +Because after an object is dropped, requests for that object are useless, +cancel them to avoid causing other problems. + +This prepares for the later addition of cancel_work_sync(). After the +reopen requests is generated, cancel it to avoid cancel_work_sync() +blocking by waiting for daemon to complete the reopen requests. 
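A userspace model of the cancellation loop (a plain array stands in for the xarray, and the request/object types are simplified stand-ins):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct object { int id; };

	struct req {
		struct object *object;
		int error;
		bool done;
	};

	static void cancel_requests_for(struct req *reqs, int n,
					struct object *obj)
	{
		for (int i = 0; i < n; i++) {
			if (reqs[i].object != obj)
				continue;
			reqs[i].error = -EIO;	/* fail the waiter ... */
			reqs[i].done = true;	/* ... like complete(&req->done) */
			reqs[i].object = NULL;	/* ... like __xa_erase() */
		}
	}

	int main(void)
	{
		struct object a = { 1 }, b = { 2 };
		struct req reqs[] = { { &a, 0, false }, { &b, 0, false } };

		cancel_requests_for(reqs, 2, &a);
		printf("req0 err=%d done=%d, req1 err=%d done=%d\n",
		       reqs[0].error, reqs[0].done,
		       reqs[1].error, reqs[1].done);
		return 0;
	}

Only the requests belonging to the dropped object are completed with -EIO; unrelated requests stay queued.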
+ +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-7-libaokun@huaweicloud.com +Acked-by: Jeff Layton +Reviewed-by: Gao Xiang +Reviewed-by: Jia Zhu +Signed-off-by: Christian Brauner +Stable-dep-of: 12e009d60852 ("cachefiles: wait for ondemand_object_worker to finish when dropping object") +Signed-off-by: Sasha Levin +--- + fs/cachefiles/ondemand.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c +index cc2de0e3ee60f..acaecfce8aaa9 100644 +--- a/fs/cachefiles/ondemand.c ++++ b/fs/cachefiles/ondemand.c +@@ -636,12 +636,31 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) + + void cachefiles_ondemand_clean_object(struct cachefiles_object *object) + { ++ unsigned long index; ++ struct cachefiles_req *req; ++ struct cachefiles_cache *cache; ++ + if (!object->ondemand) + return; + + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); ++ ++ if (!object->ondemand->ondemand_id) ++ return; ++ ++ /* Cancel all requests for the object that is being dropped. */ ++ cache = object->volume->cache; ++ xa_lock(&cache->reqs); + cachefiles_ondemand_set_object_dropping(object); ++ xa_for_each(&cache->reqs, index, req) { ++ if (req->object == object) { ++ req->error = -EIO; ++ complete(&req->done); ++ __xa_erase(&cache->reqs, index); ++ } ++ } ++ xa_unlock(&cache->reqs); + } + + int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch b/queue-6.1/cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch new file mode 100644 index 00000000000..33056434f1a --- /dev/null +++ b/queue-6.1/cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch @@ -0,0 +1,123 @@ +From 601a171f1123cb278b169ed9a2f135eaa2a53920 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:29 +0800 +Subject: cachefiles: cyclic allocation of msg_id to avoid reuse + +From: Baokun Li + +[ Upstream commit 19f4f399091478c95947f6bd7ad61622300c30d9 ] + +Reusing the msg_id after a maliciously completed reopen request may cause +a read request to remain unprocessed and result in a hung, as shown below: + + t1 | t2 | t3 +------------------------------------------------- +cachefiles_ondemand_select_req + cachefiles_ondemand_object_is_close(A) + cachefiles_ondemand_set_object_reopening(A) + queue_work(fscache_object_wq, &info->work) + ondemand_object_worker + cachefiles_ondemand_init_object(A) + cachefiles_ondemand_send_req(OPEN) + // get msg_id 6 + wait_for_completion(&req_A->done) +cachefiles_ondemand_daemon_read + // read msg_id 6 req_A + cachefiles_ondemand_get_fd + copy_to_user + // Malicious completion msg_id 6 + copen 6,-1 + cachefiles_ondemand_copen + complete(&req_A->done) + // will not set the object to close + // because ondemand_id && fd is valid. + + // ondemand_object_worker() is done + // but the object is still reopening. + + // new open req_B + cachefiles_ondemand_init_object(B) + cachefiles_ondemand_send_req(OPEN) + // reuse msg_id 6 +process_open_req + copen 6,A.size + // The expected failed copen was executed successfully + +Expect copen to fail, and when it does, it closes fd, which sets the +object to close, and then close triggers reopen again. However, due to +msg_id reuse resulting in a successful copen, the anonymous fd is not +closed until the daemon exits. 
Therefore read requests waiting for reopen +to complete may trigger hung task. + +To avoid this issue, allocate the msg_id cyclically to avoid reusing the +msg_id for a very short duration of time. + +Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-9-libaokun@huaweicloud.com +Acked-by: Jeff Layton +Reviewed-by: Gao Xiang +Reviewed-by: Jia Zhu +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cachefiles/internal.h | 1 + + fs/cachefiles/ondemand.c | 20 ++++++++++++++++---- + 2 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h +index 94f59123726ca..111ad6ecd4baf 100644 +--- a/fs/cachefiles/internal.h ++++ b/fs/cachefiles/internal.h +@@ -129,6 +129,7 @@ struct cachefiles_cache { + unsigned long req_id_next; + struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ + u32 ondemand_id_next; ++ u32 msg_id_next; + }; + + static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) +diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c +index 1f6561814e702..51173ab6dbd84 100644 +--- a/fs/cachefiles/ondemand.c ++++ b/fs/cachefiles/ondemand.c +@@ -505,20 +505,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + smp_mb(); + + if (opcode == CACHEFILES_OP_CLOSE && +- !cachefiles_ondemand_object_is_open(object)) { ++ !cachefiles_ondemand_object_is_open(object)) { + WARN_ON_ONCE(object->ondemand->ondemand_id == 0); + xas_unlock(&xas); + ret = -EIO; + goto out; + } + +- xas.xa_index = 0; ++ /* ++ * Cyclically find a free xas to avoid msg_id reuse that would ++ * cause the daemon to successfully copen a stale msg_id. ++ */ ++ xas.xa_index = cache->msg_id_next; + xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); ++ if (xas.xa_node == XAS_RESTART) { ++ xas.xa_index = 0; ++ xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK); ++ } + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); ++ + xas_store(&xas, req); +- xas_clear_mark(&xas, XA_FREE_MARK); +- xas_set_mark(&xas, CACHEFILES_REQ_NEW); ++ if (xas_valid(&xas)) { ++ cache->msg_id_next = xas.xa_index + 1; ++ xas_clear_mark(&xas, XA_FREE_MARK); ++ xas_set_mark(&xas, CACHEFILES_REQ_NEW); ++ } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-narrow-the-scope-of-triggering-epollin-ev.patch b/queue-6.1/cachefiles-narrow-the-scope-of-triggering-epollin-ev.patch new file mode 100644 index 00000000000..f5fe47b0b86 --- /dev/null +++ b/queue-6.1/cachefiles-narrow-the-scope-of-triggering-epollin-ev.patch @@ -0,0 +1,90 @@ +From bba388b65ccb4c6943dffceaf58b5b059f4c27a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Nov 2023 12:14:21 +0800 +Subject: cachefiles: narrow the scope of triggering EPOLLIN events in ondemand + mode + +From: Jia Zhu + +[ Upstream commit b817e22b2e91257ace32a6768c3c003faeaa1c5c ] + +Don't trigger EPOLLIN when there are only reopening read requests in +xarray. 
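The narrowed check, modelled in userspace (simplified stand-ins for the request and object state; POLLIN standing in for EPOLLIN):

	#include <poll.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum opcode { OP_OPEN, OP_CLOSE, OP_READ };

	struct req {
		enum opcode opcode;
		bool object_is_reopening;
	};

	static bool is_reopening_read(const struct req *r)
	{
		return r->object_is_reopening && r->opcode == OP_READ;
	}

	/* Readable only if some queued request needs the daemon right now. */
	static short poll_mask(const struct req *reqs, int n)
	{
		for (int i = 0; i < n; i++)
			if (!is_reopening_read(&reqs[i]))
				return POLLIN;
		return 0;
	}

	int main(void)
	{
		struct req only_reopen[] = { { OP_READ, true } };
		struct req mixed[] = { { OP_READ, true }, { OP_OPEN, false } };

		printf("only reopening reads: %d\n", poll_mask(only_reopen, 1));
		printf("with an open request: %d\n", poll_mask(mixed, 2));
		return 0;
	}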
+ +Suggested-by: Xin Yin +Signed-off-by: Jia Zhu +Link: https://lore.kernel.org/r/20231120041422.75170-5-zhujia.zj@bytedance.com +Reviewed-by: Jingbo Xu +Reviewed-by: David Howells +Signed-off-by: Christian Brauner +Stable-dep-of: 12e009d60852 ("cachefiles: wait for ondemand_object_worker to finish when dropping object") +Signed-off-by: Sasha Levin +--- + fs/cachefiles/daemon.c | 14 ++++++++++++-- + fs/cachefiles/internal.h | 12 ++++++++++++ + 2 files changed, 24 insertions(+), 2 deletions(-) + +diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c +index b9945e4f697be..06cdf1a8a16f6 100644 +--- a/fs/cachefiles/daemon.c ++++ b/fs/cachefiles/daemon.c +@@ -357,14 +357,24 @@ static __poll_t cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) + { + struct cachefiles_cache *cache = file->private_data; ++ XA_STATE(xas, &cache->reqs, 0); ++ struct cachefiles_req *req; + __poll_t mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (cachefiles_in_ondemand_mode(cache)) { +- if (!xa_empty(&cache->reqs)) +- mask |= EPOLLIN; ++ if (!xa_empty(&cache->reqs)) { ++ rcu_read_lock(); ++ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { ++ if (!cachefiles_ondemand_is_reopening_read(req)) { ++ mask |= EPOLLIN; ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= EPOLLIN; +diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h +index 3eea52462fc87..e0eac16e4741c 100644 +--- a/fs/cachefiles/internal.h ++++ b/fs/cachefiles/internal.h +@@ -335,6 +335,13 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ + CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); + CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); + CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); ++ ++static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) ++{ ++ return cachefiles_ondemand_object_is_reopening(req->object) && ++ req->msg.opcode == CACHEFILES_OP_READ; ++} ++ + #else + static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +@@ -365,6 +372,11 @@ static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *ob + static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj) + { + } ++ ++static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) ++{ ++ return false; ++} + #endif + + /* +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch b/queue-6.1/cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch new file mode 100644 index 00000000000..f779664f3f0 --- /dev/null +++ b/queue-6.1/cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch @@ -0,0 +1,70 @@ +From c21daeb059692c28f47f32a2ec401c964c63db26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:25 +0800 +Subject: cachefiles: propagate errors from vfs_getxattr() to avoid infinite + loop + +From: Baokun Li + +[ Upstream commit 0ece614a52bc9d219b839a6a29282b30d10e0c48 ] + +In cachefiles_check_volume_xattr(), the error returned by vfs_getxattr() +is not passed to ret, so it ends up returning -ESTALE, which leads to an +endless loop as follows: + +cachefiles_acquire_volume +retry: + ret = cachefiles_check_volume_xattr + ret = -ESTALE + xlen = vfs_getxattr // return -EIO + // The ret is not updated when xlen < 0, so -ESTALE is returned. 
+ return ret + // Supposed to jump out of the loop at this judgement. + if (ret != -ESTALE) + goto error_dir; + cachefiles_bury_object + // EIO causes rename failure + goto retry; + +Hence propagate the error returned by vfs_getxattr() to avoid the above +issue. Do the same in cachefiles_check_auxdata(). + +Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data") +Fixes: 72b957856b0c ("cachefiles: Implement metadata/coherency data storage in xattrs") +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-5-libaokun@huaweicloud.com +Reviewed-by: Gao Xiang +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cachefiles/xattr.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c +index 00b087c14995a..0ecfc9065047c 100644 +--- a/fs/cachefiles/xattr.c ++++ b/fs/cachefiles/xattr.c +@@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file + if (xlen == 0) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, tlen); + if (xlen != tlen) { +- if (xlen < 0) ++ if (xlen < 0) { ++ ret = xlen; + trace_cachefiles_vfs_error(object, file_inode(file), xlen, + cachefiles_trace_getxattr_error); ++ } + if (xlen == -EIO) + cachefiles_io_error_obj( + object, +@@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + if (xlen != len) { + if (xlen < 0) { ++ ret = xlen; + trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-stop-sending-new-request-when-dropping-ob.patch b/queue-6.1/cachefiles-stop-sending-new-request-when-dropping-ob.patch new file mode 100644 index 00000000000..7cbc144d56f --- /dev/null +++ b/queue-6.1/cachefiles-stop-sending-new-request-when-dropping-ob.patch @@ -0,0 +1,92 @@ +From fc276fa28a5fac97abd03c720dccb0d9c9c2bb0f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:26 +0800 +Subject: cachefiles: stop sending new request when dropping object + +From: Baokun Li + +[ Upstream commit b2415d1f4566b6939acacc69637eaa57815829c1 ] + +Added CACHEFILES_ONDEMAND_OBJSTATE_DROPPING indicates that the cachefiles +object is being dropped, and is set after the close request for the dropped +object completes, and no new requests are allowed to be sent after this +state. + +This prepares for the later addition of cancel_work_sync(). It prevents +leftover reopen requests from being sent, to avoid processing unnecessary +requests and to avoid cancel_work_sync() blocking by waiting for daemon to +complete the reopen requests. 
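The gate itself is small; a single-threaded sketch (simplified types, no locking) of the refusal path:

	#include <errno.h>
	#include <stdio.h>

	enum objstate {
		OBJSTATE_CLOSE,
		OBJSTATE_OPEN,
		OBJSTATE_REOPENING,
		OBJSTATE_DROPPING,
	};

	struct object { enum objstate state; };

	static int send_req(struct object *obj)
	{
		/* No new requests once the object has started dropping. */
		if (obj->state == OBJSTATE_DROPPING)
			return -EIO;
		return 0;	/* would queue the request here */
	}

	int main(void)
	{
		struct object obj = { OBJSTATE_OPEN };

		printf("while open:     %d\n", send_req(&obj));
		obj.state = OBJSTATE_DROPPING; /* set after the final CLOSE */
		printf("while dropping: %d\n", send_req(&obj));
		return 0;
	}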
+ +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-6-libaokun@huaweicloud.com +Acked-by: Jeff Layton +Reviewed-by: Gao Xiang +Reviewed-by: Jia Zhu +Signed-off-by: Christian Brauner +Stable-dep-of: 12e009d60852 ("cachefiles: wait for ondemand_object_worker to finish when dropping object") +Signed-off-by: Sasha Levin +--- + fs/cachefiles/internal.h | 2 ++ + fs/cachefiles/ondemand.c | 10 ++++++++-- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h +index e0eac16e4741c..94f59123726ca 100644 +--- a/fs/cachefiles/internal.h ++++ b/fs/cachefiles/internal.h +@@ -48,6 +48,7 @@ enum cachefiles_object_state { + CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ + CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ + CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ ++ CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. */ + }; + + struct cachefiles_ondemand_info { +@@ -335,6 +336,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ + CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); + CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); + CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); ++CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING); + + static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) + { +diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c +index 4b39f0422e590..cc2de0e3ee60f 100644 +--- a/fs/cachefiles/ondemand.c ++++ b/fs/cachefiles/ondemand.c +@@ -494,7 +494,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + */ + xas_lock(&xas); + +- if (test_bit(CACHEFILES_DEAD, &cache->flags)) { ++ if (test_bit(CACHEFILES_DEAD, &cache->flags) || ++ cachefiles_ondemand_object_is_dropping(object)) { + xas_unlock(&xas); + ret = -EIO; + goto out; +@@ -535,7 +536,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + * If error occurs after creating the anonymous fd, + * cachefiles_ondemand_fd_release() will set object to close. 
+ */ +- if (opcode == CACHEFILES_OP_OPEN) ++ if (opcode == CACHEFILES_OP_OPEN && ++ !cachefiles_ondemand_object_is_dropping(object)) + cachefiles_ondemand_set_object_close(object); + kfree(req); + return ret; +@@ -634,8 +636,12 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) + + void cachefiles_ondemand_clean_object(struct cachefiles_object *object) + { ++ if (!object->ondemand) ++ return; ++ + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); ++ cachefiles_ondemand_set_object_dropping(object); + } + + int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, +-- +2.43.0 + diff --git a/queue-6.1/cachefiles-wait-for-ondemand_object_worker-to-finish.patch b/queue-6.1/cachefiles-wait-for-ondemand_object_worker-to-finish.patch new file mode 100644 index 00000000000..25b525e7ba2 --- /dev/null +++ b/queue-6.1/cachefiles-wait-for-ondemand_object_worker-to-finish.patch @@ -0,0 +1,78 @@ +From 480dcd6b96a48aad7fd10aa9e50a3bd5896a66f5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Jun 2024 14:29:28 +0800 +Subject: cachefiles: wait for ondemand_object_worker to finish when dropping + object + +From: Hou Tao + +[ Upstream commit 12e009d60852f7bce0afc373ca0b320f14150418 ] + +When queuing ondemand_object_worker() to re-open the object, +cachefiles_object is not pinned. The cachefiles_object may be freed when +the pending read request is completed intentionally and the related +erofs is umounted. If ondemand_object_worker() runs after the object is +freed, it will incur use-after-free problem as shown below. + +process A processs B process C process D + +cachefiles_ondemand_send_req() +// send a read req X +// wait for its completion + + // close ondemand fd + cachefiles_ondemand_fd_release() + // set object as CLOSE + + cachefiles_ondemand_daemon_read() + // set object as REOPENING + queue_work(fscache_wq, &info->ondemand_work) + + // close /dev/cachefiles + cachefiles_daemon_release + cachefiles_flush_reqs + complete(&req->done) + +// read req X is completed +// umount the erofs fs +cachefiles_put_object() +// object will be freed +cachefiles_ondemand_deinit_obj_info() +kmem_cache_free(object) + // both info and object are freed + ondemand_object_worker() + +When dropping an object, it is no longer necessary to reopen the object, +so use cancel_work_sync() to cancel or wait for ondemand_object_worker() +to finish. + +Fixes: 0a7e54c1959c ("cachefiles: resend an open request if the read request's object is closed") +Signed-off-by: Hou Tao +Signed-off-by: Baokun Li +Link: https://lore.kernel.org/r/20240628062930.2467993-8-libaokun@huaweicloud.com +Acked-by: Jeff Layton +Reviewed-by: Jia Zhu +Reviewed-by: Gao Xiang +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cachefiles/ondemand.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c +index acaecfce8aaa9..1f6561814e702 100644 +--- a/fs/cachefiles/ondemand.c ++++ b/fs/cachefiles/ondemand.c +@@ -661,6 +661,9 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object) + } + } + xa_unlock(&cache->reqs); ++ ++ /* Wait for ondemand_object_worker() to finish to avoid UAF. 
*/ ++ cancel_work_sync(&object->ondemand->ondemand_work); + } + + int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, +-- +2.43.0 + diff --git a/queue-6.1/ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch b/queue-6.1/ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch new file mode 100644 index 00000000000..2fd3a0e0cc7 --- /dev/null +++ b/queue-6.1/ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch @@ -0,0 +1,122 @@ +From 756b80f53663f02d748723e7504c020cfc17bcda Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 Jul 2024 08:19:43 +0200 +Subject: ethtool: netlink: do not return SQI value if link is down + +From: Oleksij Rempel + +[ Upstream commit c184cf94e73b04ff7048d045f5413899bc664788 ] + +Do not attach SQI value if link is down. "SQI values are only valid if +link-up condition is present" per OpenAlliance specification of +100Base-T1 Interoperability Test suite [1]. The same rule would apply +for other link types. + +[1] https://opensig.org/automotive-ethernet-specifications/# + +Fixes: 806602191592 ("ethtool: provide UAPI for PHY Signal Quality Index (SQI)") +Signed-off-by: Oleksij Rempel +Reviewed-by: Andrew Lunn +Reviewed-by: Woojung Huh +Link: https://patch.msgid.link/20240709061943.729381-1-o.rempel@pengutronix.de +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ethtool/linkstate.c | 41 ++++++++++++++++++++++++++++------------- + 1 file changed, 28 insertions(+), 13 deletions(-) + +diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c +index fb676f349455a..470582a70ccbe 100644 +--- a/net/ethtool/linkstate.c ++++ b/net/ethtool/linkstate.c +@@ -36,6 +36,8 @@ static int linkstate_get_sqi(struct net_device *dev) + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; ++ else if (!phydev->link) ++ ret = -ENETDOWN; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); +@@ -54,6 +56,8 @@ static int linkstate_get_sqi_max(struct net_device *dev) + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; ++ else if (!phydev->link) ++ ret = -ENETDOWN; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); +@@ -61,6 +65,17 @@ static int linkstate_get_sqi_max(struct net_device *dev) + return ret; + }; + ++static bool linkstate_sqi_critical_error(int sqi) ++{ ++ return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; ++} ++ ++static bool linkstate_sqi_valid(struct linkstate_reply_data *data) ++{ ++ return data->sqi >= 0 && data->sqi_max >= 0 && ++ data->sqi <= data->sqi_max; ++} ++ + static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) + { +@@ -92,12 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, + data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); +- if (ret < 0 && ret != -EOPNOTSUPP) ++ if (linkstate_sqi_critical_error(ret)) + goto out; + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); +- if (ret < 0 && ret != -EOPNOTSUPP) ++ if (linkstate_sqi_critical_error(ret)) + goto out; + data->sqi_max = ret; + +@@ -122,11 +137,10 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + + 0; + +- if (data->sqi != -EOPNOTSUPP) +- len += nla_total_size(sizeof(u32)); +- +- if (data->sqi_max != -EOPNOTSUPP) +- len += nla_total_size(sizeof(u32)); ++ if (linkstate_sqi_valid(data)) { ++ len += nla_total_size(sizeof(u32)); /* 
LINKSTATE_SQI */ ++ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ ++ } + + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ +@@ -147,13 +161,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) + return -EMSGSIZE; + +- if (data->sqi != -EOPNOTSUPP && +- nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) +- return -EMSGSIZE; ++ if (linkstate_sqi_valid(data)) { ++ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) ++ return -EMSGSIZE; + +- if (data->sqi_max != -EOPNOTSUPP && +- nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) +- return -EMSGSIZE; ++ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, ++ data->sqi_max)) ++ return -EMSGSIZE; ++ } + + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, +-- +2.43.0 + diff --git a/queue-6.1/filelock-fix-potential-use-after-free-in-posix_lock_.patch b/queue-6.1/filelock-fix-potential-use-after-free-in-posix_lock_.patch new file mode 100644 index 00000000000..91497eb8bd6 --- /dev/null +++ b/queue-6.1/filelock-fix-potential-use-after-free-in-posix_lock_.patch @@ -0,0 +1,50 @@ +From 6e27922202afc4a8a4ff04e6bbccd83c1d5bf81f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Jul 2024 18:44:48 -0400 +Subject: filelock: fix potential use-after-free in posix_lock_inode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jeff Layton + +[ Upstream commit 1b3ec4f7c03d4b07bad70697d7e2f4088d2cfe92 ] + +Light Hsieh reported a KASAN UAF warning in trace_posix_lock_inode(). +The request pointer had been changed earlier to point to a lock entry +that was added to the inode's list. However, before the tracepoint could +fire, another task raced in and freed that lock. + +Fix this by moving the tracepoint inside the spinlock, which should +ensure that this doesn't happen. + +Fixes: 74f6f5912693 ("locks: fix KASAN: use-after-free in trace_event_raw_event_filelock_lock") +Link: https://lore.kernel.org/linux-fsdevel/724ffb0a2962e912ea62bb0515deadf39c325112.camel@kernel.org/ +Reported-by: Light Hsieh (謝明燈) +Signed-off-by: Jeff Layton +Link: https://lore.kernel.org/r/20240702-filelock-6-10-v1-1-96e766aadc98@kernel.org +Reviewed-by: Alexander Aring +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/locks.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/locks.c b/fs/locks.c +index 7d0918b8fe5d6..c23bcfe9b0fdd 100644 +--- a/fs/locks.c ++++ b/fs/locks.c +@@ -1298,9 +1298,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, + locks_wake_up_blocks(left); + } + out: ++ trace_posix_lock_inode(inode, request, error); + spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); +- trace_posix_lock_inode(inode, request, error); + /* + * Free any unused locks. 
+ */ +-- +2.43.0 + diff --git a/queue-6.1/fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch b/queue-6.1/fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch new file mode 100644 index 00000000000..5606ffc47ec --- /dev/null +++ b/queue-6.1/fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch @@ -0,0 +1,44 @@ +From f523a86638773d00989770c97854689f6d7909f3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Apr 2024 10:10:08 +0800 +Subject: fs/dcache: Re-use value stored to dentry->d_flags instead of + re-reading + +From: linke li + +[ Upstream commit 8bfb40be31ddea0cb4664b352e1797cfe6c91976 ] + +Currently, the __d_clear_type_and_inode() writes the value flags to +dentry->d_flags, then immediately re-reads it in order to use it in a if +statement. This re-read is useless because no other update to +dentry->d_flags can occur at this point. + +This commit therefore re-use flags in the if statement instead of +re-reading dentry->d_flags. + +Signed-off-by: linke li +Link: https://lore.kernel.org/r/tencent_5E187BD0A61BA28605E85405F15228254D0A@qq.com +Reviewed-by: Jan Kara +Signed-off-by: Christian Brauner +Stable-dep-of: aabfe57ebaa7 ("vfs: don't mod negative dentry count when on shrinker list") +Signed-off-by: Sasha Levin +--- + fs/dcache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index b09bc88dbbec7..9b10f1872f6c9 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -356,7 +356,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + WRITE_ONCE(dentry->d_flags, flags); + dentry->d_inode = NULL; +- if (dentry->d_flags & DCACHE_LRU_LIST) ++ if (flags & DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); + } + +-- +2.43.0 + diff --git a/queue-6.1/i40e-fix-xdp-program-unloading-while-removing-the-dr.patch b/queue-6.1/i40e-fix-xdp-program-unloading-while-removing-the-dr.patch new file mode 100644 index 00000000000..6b3a19b0ce3 --- /dev/null +++ b/queue-6.1/i40e-fix-xdp-program-unloading-while-removing-the-dr.patch @@ -0,0 +1,119 @@ +From 80f0aafd75846964cf0f020cdbdbf999f337e571 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jul 2024 16:07:49 -0700 +Subject: i40e: Fix XDP program unloading while removing the driver + +From: Michal Kubiak + +[ Upstream commit 01fc5142ae6b06b61ed51a624f2732d6525d8ea3 ] + +The commit 6533e558c650 ("i40e: Fix reset path while removing +the driver") introduced a new PF state "__I40E_IN_REMOVE" to block +modifying the XDP program while the driver is being removed. +Unfortunately, such a change is useful only if the ".ndo_bpf()" +callback was called out of the rmmod context because unloading the +existing XDP program is also a part of driver removing procedure. +In other words, from the rmmod context the driver is expected to +unload the XDP program without reporting any errors. Otherwise, +the kernel warning with callstack is printed out to dmesg. + +Example failing scenario: + 1. Load the i40e driver. + 2. Load the XDP program. + 3. Unload the i40e driver (using "rmmod" command). + +The example kernel warning log: + +[ +0.004646] WARNING: CPU: 94 PID: 10395 at net/core/dev.c:9290 unregister_netdevice_many_notify+0x7a9/0x870 +[...] +[ +0.010959] RIP: 0010:unregister_netdevice_many_notify+0x7a9/0x870 +[...] +[ +0.002726] Call Trace: +[ +0.002457] +[ +0.002119] ? __warn+0x80/0x120 +[ +0.003245] ? unregister_netdevice_many_notify+0x7a9/0x870 +[ +0.005586] ? report_bug+0x164/0x190 +[ +0.003678] ? 
handle_bug+0x3c/0x80 +[ +0.003503] ? exc_invalid_op+0x17/0x70 +[ +0.003846] ? asm_exc_invalid_op+0x1a/0x20 +[ +0.004200] ? unregister_netdevice_many_notify+0x7a9/0x870 +[ +0.005579] ? unregister_netdevice_many_notify+0x3cc/0x870 +[ +0.005586] unregister_netdevice_queue+0xf7/0x140 +[ +0.004806] unregister_netdev+0x1c/0x30 +[ +0.003933] i40e_vsi_release+0x87/0x2f0 [i40e] +[ +0.004604] i40e_remove+0x1a1/0x420 [i40e] +[ +0.004220] pci_device_remove+0x3f/0xb0 +[ +0.003943] device_release_driver_internal+0x19f/0x200 +[ +0.005243] driver_detach+0x48/0x90 +[ +0.003586] bus_remove_driver+0x6d/0xf0 +[ +0.003939] pci_unregister_driver+0x2e/0xb0 +[ +0.004278] i40e_exit_module+0x10/0x5f0 [i40e] +[ +0.004570] __do_sys_delete_module.isra.0+0x197/0x310 +[ +0.005153] do_syscall_64+0x85/0x170 +[ +0.003684] ? syscall_exit_to_user_mode+0x69/0x220 +[ +0.004886] ? do_syscall_64+0x95/0x170 +[ +0.003851] ? exc_page_fault+0x7e/0x180 +[ +0.003932] entry_SYSCALL_64_after_hwframe+0x71/0x79 +[ +0.005064] RIP: 0033:0x7f59dc9347cb +[ +0.003648] Code: 73 01 c3 48 8b 0d 65 16 0c 00 f7 d8 64 89 01 48 83 +c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f +05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 16 0c 00 f7 d8 64 89 01 48 +[ +0.018753] RSP: 002b:00007ffffac99048 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 +[ +0.007577] RAX: ffffffffffffffda RBX: 0000559b9bb2f6e0 RCX: 00007f59dc9347cb +[ +0.007140] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000559b9bb2f748 +[ +0.007146] RBP: 00007ffffac99070 R08: 1999999999999999 R09: 0000000000000000 +[ +0.007133] R10: 00007f59dc9a5ac0 R11: 0000000000000206 R12: 0000000000000000 +[ +0.007141] R13: 00007ffffac992d8 R14: 0000559b9bb2f6e0 R15: 0000000000000000 +[ +0.007151] +[ +0.002204] ---[ end trace 0000000000000000 ]--- + +Fix this by checking if the XDP program is being loaded or unloaded. +Then, block only loading a new program while "__I40E_IN_REMOVE" is set. +Also, move testing "__I40E_IN_REMOVE" flag to the beginning of XDP_SETUP +callback to avoid unnecessary operations and checks. + +Fixes: 6533e558c650 ("i40e: Fix reset path while removing the driver") +Signed-off-by: Michal Kubiak +Reviewed-by: Maciej Fijalkowski +Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) +Signed-off-by: Tony Nguyen +Link: https://patch.msgid.link/20240708230750.625986-1-anthony.l.nguyen@intel.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index 9efd4b962dce2..1194dcacbd29e 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -13315,6 +13315,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, + bool need_reset; + int i; + ++ /* VSI shall be deleted in a moment, block loading new programs */ ++ if (prog && test_bit(__I40E_IN_REMOVE, pf->state)) ++ return -EINVAL; ++ + /* Don't allow frames that span over multiple buffers */ + if (frame_size > i40e_calculate_vsi_rx_buf_len(vsi)) { + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); +@@ -13323,14 +13327,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, + + /* When turning XDP on->off/off->on we reset and rebuild the rings. 
*/ + need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog); +- + if (need_reset) + i40e_prep_for_reset(pf); + +- /* VSI shall be deleted in a moment, just return EINVAL */ +- if (test_bit(__I40E_IN_REMOVE, pf->state)) +- return -EINVAL; +- + old_prog = xchg(&vsi->xdp_prog, prog); + + if (need_reset) { +-- +2.43.0 + diff --git a/queue-6.1/mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch b/queue-6.1/mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch new file mode 100644 index 00000000000..2fccf06b46b --- /dev/null +++ b/queue-6.1/mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch @@ -0,0 +1,44 @@ +From a2b56f162ac998de20a082bc3c7748e2f1c8b7a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 25 Jun 2024 20:16:39 -0400 +Subject: mm: prevent derefencing NULL ptr in pfn_section_valid() + +From: Waiman Long + +[ Upstream commit 82f0b6f041fad768c28b4ad05a683065412c226e ] + +Commit 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing +memory_section->usage") changed pfn_section_valid() to add a READ_ONCE() +call around "ms->usage" to fix a race with section_deactivate() where +ms->usage can be cleared. The READ_ONCE() call, by itself, is not enough +to prevent NULL pointer dereference. We need to check its value before +dereferencing it. + +Link: https://lkml.kernel.org/r/20240626001639.1350646-1-longman@redhat.com +Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage") +Signed-off-by: Waiman Long +Cc: Charan Teja Kalla +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + include/linux/mmzone.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 93d2003091222..61906244c14d6 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -1814,8 +1814,9 @@ static inline int subsection_map_index(unsigned long pfn) + static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) + { + int idx = subsection_map_index(pfn); ++ struct mem_section_usage *usage = READ_ONCE(ms->usage); + +- return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); ++ return usage ? test_bit(idx, usage->subsection_map) : 0; + } + #else + static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) +-- +2.43.0 + diff --git a/queue-6.1/net-ethernet-lantiq_etop-fix-double-free-in-detach.patch b/queue-6.1/net-ethernet-lantiq_etop-fix-double-free-in-detach.patch new file mode 100644 index 00000000000..945741bcc95 --- /dev/null +++ b/queue-6.1/net-ethernet-lantiq_etop-fix-double-free-in-detach.patch @@ -0,0 +1,43 @@ +From cfec6d893b5e7b82ed597f8d141a9b1cf2c4746d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jul 2024 22:58:26 +0200 +Subject: net: ethernet: lantiq_etop: fix double free in detach + +From: Aleksander Jan Bajkowski + +[ Upstream commit e1533b6319ab9c3a97dad314dd88b3783bc41b69 ] + +The number of the currently released descriptor is never incremented +which results in the same skb being released multiple times. 
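+
+A minimal userspace sketch of the bug class (hypothetical helper, not
+the driver code): the counter the loop increments must be the same
+variable used to index the array being freed, otherwise one entry is
+released LTQ_DESC_NUM times while the rest leak.
+
+  #include <stdlib.h>
+
+  #define LTQ_DESC_NUM 64
+
+  static void free_ring(void *skb[], int *desc)
+  {
+          /* the buggy loop bumped a separate local counter while
+           * always freeing skb[*desc]; iterating the index that is
+           * actually used frees each entry exactly once */
+          for (*desc = 0; *desc < LTQ_DESC_NUM; (*desc)++)
+                  free(skb[*desc]);
+  }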
+ +Fixes: 504d4721ee8e ("MIPS: Lantiq: Add ethernet driver") +Reported-by: Joe Perches +Closes: https://lore.kernel.org/all/fc1bf93d92bb5b2f99c6c62745507cc22f3a7b2d.camel@perches.com/ +Signed-off-by: Aleksander Jan Bajkowski +Reviewed-by: Andrew Lunn +Link: https://patch.msgid.link/20240708205826.5176-1-olek2@wp.pl +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/lantiq_etop.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c +index f5961bdcc4809..61baf1da76eea 100644 +--- a/drivers/net/ethernet/lantiq_etop.c ++++ b/drivers/net/ethernet/lantiq_etop.c +@@ -217,9 +217,9 @@ ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch) + if (ch->dma.irq) + free_irq(ch->dma.irq, priv); + if (IS_RX(ch->idx)) { +- int desc; ++ struct ltq_dma_channel *dma = &ch->dma; + +- for (desc = 0; desc < LTQ_DESC_NUM; desc++) ++ for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++) + dev_kfree_skb_any(ch->skb[ch->dma.desc]); + } + } +-- +2.43.0 + diff --git a/queue-6.1/net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch b/queue-6.1/net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch new file mode 100644 index 00000000000..66cd0eb7d79 --- /dev/null +++ b/queue-6.1/net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch @@ -0,0 +1,55 @@ +From 7fa7c84c102ed338a2d455e52aa9941b0087df81 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jul 2024 14:52:09 +0800 +Subject: net: ethernet: mtk-star-emac: set mac_managed_pm when probing + +From: Jian Hui Lee + +[ Upstream commit 8c6790b5c25dfac11b589cc37346bcf9e23ad468 ] + +The below commit introduced a warning message when phy state is not in +the states: PHY_HALTED, PHY_READY, and PHY_UP. +commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") + +mtk-star-emac doesn't need mdiobus suspend/resume. To fix the warning +message during resume, indicate the phy resume/suspend is managed by the +mac when probing. 
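+
+For context, a condensed sketch of the phylib behaviour this relies on
+(simplified, not the verbatim mdio_bus_phy_resume() from
+drivers/net/phy/phy_device.c):
+
+  static int mdio_bus_phy_resume(struct device *dev)
+  {
+          struct phy_device *phydev = to_phy_device(dev);
+
+          /* when the MAC owns PM, phylib skips its own resume path,
+           * including the state-machine WARN this patch silences */
+          if (phydev->mac_managed_pm)
+                  return 0;
+
+          /* ... otherwise phylib resumes the PHY itself ... */
+          return 0;
+  }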
+ +Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") +Signed-off-by: Jian Hui Lee +Reviewed-by: Jacob Keller +Link: https://patch.msgid.link/20240708065210.4178980-1-jianhui.lee@canonical.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mediatek/mtk_star_emac.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c +index 7050351250b7a..ad27749c0931c 100644 +--- a/drivers/net/ethernet/mediatek/mtk_star_emac.c ++++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c +@@ -1531,6 +1531,7 @@ static int mtk_star_probe(struct platform_device *pdev) + { + struct device_node *of_node; + struct mtk_star_priv *priv; ++ struct phy_device *phydev; + struct net_device *ndev; + struct device *dev; + void __iomem *base; +@@ -1656,6 +1657,12 @@ static int mtk_star_probe(struct platform_device *pdev) + netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll); + netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll); + ++ phydev = of_phy_find_device(priv->phy_node); ++ if (phydev) { ++ phydev->mac_managed_pm = true; ++ put_device(&phydev->mdio.dev); ++ } ++ + return devm_register_netdev(dev, ndev); + } + +-- +2.43.0 + diff --git a/queue-6.1/net-fix-rc7-s-__skb_datagram_iter.patch b/queue-6.1/net-fix-rc7-s-__skb_datagram_iter.patch new file mode 100644 index 00000000000..2ae319eac55 --- /dev/null +++ b/queue-6.1/net-fix-rc7-s-__skb_datagram_iter.patch @@ -0,0 +1,45 @@ +From b10d47dcc440d3c66b9eb216f61a3458570494b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jul 2024 07:46:00 -0700 +Subject: net: fix rc7's __skb_datagram_iter() + +From: Hugh Dickins + +[ Upstream commit f153831097b4435f963e385304cc0f1acba1c657 ] + +X would not start in my old 32-bit partition (and the "n"-handling looks +just as wrong on 64-bit, but for whatever reason did not show up there): +"n" must be accumulated over all pages before it's added to "offset" and +compared with "copy", immediately after the skb_frag_foreach_page() loop. 
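+
+The bug class as a self-contained userspace sketch (hypothetical
+helper, not the networking code): when one logical copy spans several
+chunks, the per-chunk byte counts must be summed before the total is
+compared with the requested length.
+
+  #include <string.h>
+
+  static size_t copy_chunked(char *dst, const char *const *chunk,
+                             const size_t *len, size_t nchunks,
+                             size_t want)
+  {
+          size_t n = 0;   /* accumulated total, as in the fixed "n +=" */
+
+          for (size_t i = 0; i < nchunks && n < want; i++) {
+                  size_t take = len[i] < want - n ? len[i] : want - n;
+
+                  memcpy(dst + n, chunk[i], take);
+                  n += take;  /* "n = take" would keep only the last
+                               * chunk's count and skew the offset */
+          }
+          return n;           /* only now compare the total with want */
+  }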
+ +Fixes: d2d30a376d9c ("net: allow skb_datagram_iter to be called from any context") +Signed-off-by: Hugh Dickins +Reviewed-by: Sagi Grimberg +Link: https://patch.msgid.link/fef352e8-b89a-da51-f8ce-04bc39ee6481@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/core/datagram.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/core/datagram.c b/net/core/datagram.c +index cdd65ca3124a4..87c39cc12327f 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -441,11 +441,12 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + if (copy > len) + copy = len; + ++ n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); +- n = INDIRECT_CALL_1(cb, simple_copy_to_iter, ++ n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } +-- +2.43.0 + diff --git a/queue-6.1/net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch b/queue-6.1/net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch new file mode 100644 index 00000000000..73816b12c56 --- /dev/null +++ b/queue-6.1/net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch @@ -0,0 +1,41 @@ +From f8b4ed67efc4725fa5fde6d4424a93e26e68490a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 5 Jul 2024 10:49:54 +0200 +Subject: net: phy: microchip: lan87xx: reinit PHY after cable test + +From: Oleksij Rempel + +[ Upstream commit 30f747b8d53bc73555f268d0f48f56174fa5bf10 ] + +Reinit PHY after cable test, otherwise link can't be established on +tested port. This issue is reproducible on LAN9372 switches with +integrated 100BaseT1 PHYs. + +Fixes: 788050256c411 ("net: phy: microchip_t1: add cable test support for lan87xx phy") +Signed-off-by: Oleksij Rempel +Reviewed-by: Andrew Lunn +Reviewed-by: Michal Kubiak +Reviewed-by: Florian Fainelli +Link: https://patch.msgid.link/20240705084954.83048-1-o.rempel@pengutronix.de +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/phy/microchip_t1.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c +index 8569a545e0a3f..9517243e3051e 100644 +--- a/drivers/net/phy/microchip_t1.c ++++ b/drivers/net/phy/microchip_t1.c +@@ -711,7 +711,7 @@ static int lan87xx_cable_test_report(struct phy_device *phydev) + ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, + lan87xx_cable_test_report_trans(detect)); + +- return 0; ++ return phy_init_hw(phydev); + } + + static int lan87xx_cable_test_get_status(struct phy_device *phydev, +-- +2.43.0 + diff --git a/queue-6.1/net-sched-fix-uaf-when-resolving-a-clash.patch b/queue-6.1/net-sched-fix-uaf-when-resolving-a-clash.patch new file mode 100644 index 00000000000..58069325ad6 --- /dev/null +++ b/queue-6.1/net-sched-fix-uaf-when-resolving-a-clash.patch @@ -0,0 +1,131 @@ +From 7de8447ba1b93d5698b921b9ef49c66bed1b74fc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 10 Jul 2024 13:37:47 +0800 +Subject: net/sched: Fix UAF when resolving a clash + +From: Chengen Du + +[ Upstream commit 26488172b0292bed837b95a006a3f3431d1898c3 ] + +KASAN reports the following UAF: + + BUG: KASAN: slab-use-after-free in tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct] + Read of size 1 at addr ffff888c07603600 by task handler130/6469 + + Call Trace: + + dump_stack_lvl+0x48/0x70 + print_address_description.constprop.0+0x33/0x3d0 + print_report+0xc0/0x2b0 + 
kasan_report+0xd0/0x120 + __asan_load1+0x6c/0x80 + tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct] + tcf_ct_act+0x886/0x1350 [act_ct] + tcf_action_exec+0xf8/0x1f0 + fl_classify+0x355/0x360 [cls_flower] + __tcf_classify+0x1fd/0x330 + tcf_classify+0x21c/0x3c0 + sch_handle_ingress.constprop.0+0x2c5/0x500 + __netif_receive_skb_core.constprop.0+0xb25/0x1510 + __netif_receive_skb_list_core+0x220/0x4c0 + netif_receive_skb_list_internal+0x446/0x620 + napi_complete_done+0x157/0x3d0 + gro_cell_poll+0xcf/0x100 + __napi_poll+0x65/0x310 + net_rx_action+0x30c/0x5c0 + __do_softirq+0x14f/0x491 + __irq_exit_rcu+0x82/0xc0 + irq_exit_rcu+0xe/0x20 + common_interrupt+0xa1/0xb0 + + + asm_common_interrupt+0x27/0x40 + + Allocated by task 6469: + kasan_save_stack+0x38/0x70 + kasan_set_track+0x25/0x40 + kasan_save_alloc_info+0x1e/0x40 + __kasan_krealloc+0x133/0x190 + krealloc+0xaa/0x130 + nf_ct_ext_add+0xed/0x230 [nf_conntrack] + tcf_ct_act+0x1095/0x1350 [act_ct] + tcf_action_exec+0xf8/0x1f0 + fl_classify+0x355/0x360 [cls_flower] + __tcf_classify+0x1fd/0x330 + tcf_classify+0x21c/0x3c0 + sch_handle_ingress.constprop.0+0x2c5/0x500 + __netif_receive_skb_core.constprop.0+0xb25/0x1510 + __netif_receive_skb_list_core+0x220/0x4c0 + netif_receive_skb_list_internal+0x446/0x620 + napi_complete_done+0x157/0x3d0 + gro_cell_poll+0xcf/0x100 + __napi_poll+0x65/0x310 + net_rx_action+0x30c/0x5c0 + __do_softirq+0x14f/0x491 + + Freed by task 6469: + kasan_save_stack+0x38/0x70 + kasan_set_track+0x25/0x40 + kasan_save_free_info+0x2b/0x60 + ____kasan_slab_free+0x180/0x1f0 + __kasan_slab_free+0x12/0x30 + slab_free_freelist_hook+0xd2/0x1a0 + __kmem_cache_free+0x1a2/0x2f0 + kfree+0x78/0x120 + nf_conntrack_free+0x74/0x130 [nf_conntrack] + nf_ct_destroy+0xb2/0x140 [nf_conntrack] + __nf_ct_resolve_clash+0x529/0x5d0 [nf_conntrack] + nf_ct_resolve_clash+0xf6/0x490 [nf_conntrack] + __nf_conntrack_confirm+0x2c6/0x770 [nf_conntrack] + tcf_ct_act+0x12ad/0x1350 [act_ct] + tcf_action_exec+0xf8/0x1f0 + fl_classify+0x355/0x360 [cls_flower] + __tcf_classify+0x1fd/0x330 + tcf_classify+0x21c/0x3c0 + sch_handle_ingress.constprop.0+0x2c5/0x500 + __netif_receive_skb_core.constprop.0+0xb25/0x1510 + __netif_receive_skb_list_core+0x220/0x4c0 + netif_receive_skb_list_internal+0x446/0x620 + napi_complete_done+0x157/0x3d0 + gro_cell_poll+0xcf/0x100 + __napi_poll+0x65/0x310 + net_rx_action+0x30c/0x5c0 + __do_softirq+0x14f/0x491 + +The ct may be dropped if a clash has been resolved but is still passed to +the tcf_ct_flow_table_process_conn function for further usage. This issue +can be fixed by retrieving ct from skb again after confirming conntrack. + +Fixes: 0cc254e5aa37 ("net/sched: act_ct: Offload connections with commit action") +Co-developed-by: Gerald Yang +Signed-off-by: Gerald Yang +Signed-off-by: Chengen Du +Link: https://patch.msgid.link/20240710053747.13223-1-chengen.du@canonical.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/sched/act_ct.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index cd95a315fde82..44ff7f356ec15 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -1212,6 +1212,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, + */ + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + goto drop; ++ ++ /* The ct may be dropped if a clash has been resolved, ++ * so it's necessary to retrieve it from skb again to ++ * prevent UAF. 
++ */ ++ ct = nf_ct_get(skb, &ctinfo); ++ if (!ct) ++ skip_add = true; + } + + if (!skip_add) +-- +2.43.0 + diff --git a/queue-6.1/net-sunrpc-remap-eperm-in-case-of-connection-failure.patch b/queue-6.1/net-sunrpc-remap-eperm-in-case-of-connection-failure.patch new file mode 100644 index 00000000000..038540a1d47 --- /dev/null +++ b/queue-6.1/net-sunrpc-remap-eperm-in-case-of-connection-failure.patch @@ -0,0 +1,64 @@ +From 47cf0624e2d33d14d0dcc3e8ff3627e21ea0339b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 4 Jul 2024 08:41:57 +0200 +Subject: net, sunrpc: Remap EPERM in case of connection failure in + xs_tcp_setup_socket + +From: Daniel Borkmann + +[ Upstream commit 626dfed5fa3bfb41e0dffd796032b555b69f9cde ] + +When using a BPF program on kernel_connect(), the call can return -EPERM. This +causes xs_tcp_setup_socket() to loop forever, filling up the syslog and causing +the kernel to potentially freeze up. + +Neil suggested: + + This will propagate -EPERM up into other layers which might not be ready + to handle it. It might be safer to map EPERM to an error we would be more + likely to expect from the network system - such as ECONNREFUSED or ENETDOWN. + +ECONNREFUSED as error seems reasonable. For programs setting a different error +can be out of reach (see handling in 4fbac77d2d09) in particular on kernels +which do not have f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err +instead of allow boolean"), thus given that it is better to simply remap for +consistent behavior. UDP does handle EPERM in xs_udp_send_request(). + +Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect") +Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind") +Co-developed-by: Lex Siegel +Signed-off-by: Lex Siegel +Signed-off-by: Daniel Borkmann +Cc: Neil Brown +Cc: Trond Myklebust +Cc: Anna Schumaker +Link: https://github.com/cilium/cilium/issues/33395 +Link: https://lore.kernel.org/bpf/171374175513.12877.8993642908082014881@noble.neil.brown.name +Link: https://patch.msgid.link/9069ec1d59e4b2129fc23433349fd5580ad43921.1720075070.git.daniel@iogearbox.net +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/sunrpc/xprtsock.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c +index 05aa32696e7c2..02f651f85e739 100644 +--- a/net/sunrpc/xprtsock.c ++++ b/net/sunrpc/xprtsock.c +@@ -2333,6 +2333,13 @@ static void xs_tcp_setup_socket(struct work_struct *work) + transport->srcport = 0; + status = -EAGAIN; + break; ++ case -EPERM: ++ /* Happens, for instance, if a BPF program is preventing ++ * the connect. Remap the error so upper layers can better ++ * deal with it. ++ */ ++ status = -ECONNREFUSED; ++ fallthrough; + case -EINVAL: + /* Happens, for instance, if the user specified a link + * local IPv6 address without a scope-id. 
+-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-fix-incorrect-value-output-on-error-pat.patch b/queue-6.1/octeontx2-af-fix-incorrect-value-output-on-error-pat.patch new file mode 100644 index 00000000000..6139bcb6743 --- /dev/null +++ b/queue-6.1/octeontx2-af-fix-incorrect-value-output-on-error-pat.patch @@ -0,0 +1,44 @@ +From 5a964e46e1aaa12cfb118a93529cc56dfe66349b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 5 Jul 2024 12:53:17 +0300 +Subject: octeontx2-af: Fix incorrect value output on error path in + rvu_check_rsrc_availability() + +From: Aleksandr Mishin + +[ Upstream commit 442e26af9aa8115c96541026cbfeaaa76c85d178 ] + +In rvu_check_rsrc_availability() in case of invalid SSOW req, an incorrect +data is printed to error log. 'req->sso' value is printed instead of +'req->ssow'. Looks like "copy-paste" mistake. + +Fix this mistake by replacing 'req->sso' with 'req->ssow'. + +Found by Linux Verification Center (linuxtesting.org) with SVACE. + +Fixes: 746ea74241fa ("octeontx2-af: Add RVU block LF provisioning support") +Signed-off-by: Aleksandr Mishin +Reviewed-by: Simon Horman +Link: https://patch.msgid.link/20240705095317.12640-1-amishin@t-argos.ru +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +index a7034b47ed6c9..c7829265eade9 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +@@ -1638,7 +1638,7 @@ static int rvu_check_rsrc_availability(struct rvu *rvu, + if (req->ssow > block->lf.max) { + dev_err(&rvu->pdev->dev, + "Func 0x%x: Invalid SSOW req, %d > max %d\n", +- pcifunc, req->sso, block->lf.max); ++ pcifunc, req->ssow, block->lf.max); + return -EINVAL; + } + mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr); +-- +2.43.0 + diff --git a/queue-6.1/ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch b/queue-6.1/ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch new file mode 100644 index 00000000000..57352014353 --- /dev/null +++ b/queue-6.1/ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch @@ -0,0 +1,67 @@ +From 3277d7c93db469c71caed9dc9e7be624c1fc42eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jul 2024 14:56:15 +0300 +Subject: ppp: reject claimed-as-LCP but actually malformed packets + +From: Dmitry Antipov + +[ Upstream commit f2aeb7306a898e1cbd03963d376f4b6656ca2b55 ] + +Since 'ppp_async_encode()' assumes valid LCP packets (with code +from 1 to 7 inclusive), add 'ppp_check_packet()' to ensure that +LCP packet has an actual body beyond PPP_LCP header bytes, and +reject claimed-as-LCP but actually malformed data otherwise. 
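+
+The same validation as a standalone userspace sketch (PPP_LCP is the
+standard 0xc021 protocol number; the helper name is hypothetical):
+
+  #include <stdbool.h>
+  #include <stddef.h>
+  #include <stdint.h>
+
+  #define PPP_PROTO_LEN  2       /* 16-bit protocol field */
+  #define PPP_LCP        0xc021  /* Link Control Protocol */
+  #define PPP_LCP_HDRLEN 4       /* code, identifier, 16-bit length */
+
+  static bool check_packet(const uint8_t *data, size_t count)
+  {
+          uint16_t proto = ((uint16_t)data[0] << 8) | data[1];
+
+          /* anything claiming to be LCP must carry at least the
+           * 4-byte LCP header after the protocol field */
+          return proto != PPP_LCP ||
+                 count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN;
+  }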
+ +Reported-by: syzbot+ec0723ba9605678b14bf@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=ec0723ba9605678b14bf +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Dmitry Antipov +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ppp/ppp_generic.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c +index 1d71f5276241c..5a6fa566e722f 100644 +--- a/drivers/net/ppp/ppp_generic.c ++++ b/drivers/net/ppp/ppp_generic.c +@@ -70,6 +70,7 @@ + #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ + + #define PPP_PROTO_LEN 2 ++#define PPP_LCP_HDRLEN 4 + + /* + * An instance of /dev/ppp can be associated with either a ppp +@@ -491,6 +492,15 @@ static ssize_t ppp_read(struct file *file, char __user *buf, + return ret; + } + ++static bool ppp_check_packet(struct sk_buff *skb, size_t count) ++{ ++ /* LCP packets must include LCP header which 4 bytes long: ++ * 1-byte code, 1-byte identifier, and 2-byte length. ++ */ ++ return get_unaligned_be16(skb->data) != PPP_LCP || ++ count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN; ++} ++ + static ssize_t ppp_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +@@ -513,6 +523,11 @@ static ssize_t ppp_write(struct file *file, const char __user *buf, + kfree_skb(skb); + goto out; + } ++ ret = -EINVAL; ++ if (unlikely(!ppp_check_packet(skb, count))) { ++ kfree_skb(skb); ++ goto out; ++ } + + switch (pf->kind) { + case INTERFACE: +-- +2.43.0 + diff --git a/queue-6.1/series b/queue-6.1/series new file mode 100644 index 00000000000..5d2462af1e2 --- /dev/null +++ b/queue-6.1/series @@ -0,0 +1,29 @@ +mm-prevent-derefencing-null-ptr-in-pfn_section_valid.patch +cachefiles-propagate-errors-from-vfs_getxattr-to-avo.patch +cachefiles-narrow-the-scope-of-triggering-epollin-ev.patch +cachefiles-stop-sending-new-request-when-dropping-ob.patch +cachefiles-cancel-all-requests-for-the-object-that-i.patch +cachefiles-wait-for-ondemand_object_worker-to-finish.patch +cachefiles-cyclic-allocation-of-msg_id-to-avoid-reus.patch +cachefiles-add-missing-lock-protection-when-polling.patch +filelock-fix-potential-use-after-free-in-posix_lock_.patch +fs-dcache-re-use-value-stored-to-dentry-d_flags-inst.patch +vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch +tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch +net-phy-microchip-lan87xx-reinit-phy-after-cable-tes.patch +skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch +octeontx2-af-fix-incorrect-value-output-on-error-pat.patch +net-fix-rc7-s-__skb_datagram_iter.patch +i40e-fix-xdp-program-unloading-while-removing-the-dr.patch +net-ethernet-lantiq_etop-fix-double-free-in-detach.patch +bpf-refactor-some-inode-task-sk-storage-functions-fo.patch +bpf-reduce-smap-elem_size.patch +bpf-use-bpf_map_kvcalloc-in-bpf_local_storage.patch +bpf-remove-__bpf_local_storage_map_alloc.patch +bpf-fix-order-of-args-in-call-to-bpf_map_kvcalloc.patch +net-ethernet-mtk-star-emac-set-mac_managed_pm-when-p.patch +ppp-reject-claimed-as-lcp-but-actually-malformed-pac.patch +ethtool-netlink-do-not-return-sqi-value-if-link-is-d.patch +udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch +net-sched-fix-uaf-when-resolving-a-clash.patch +net-sunrpc-remap-eperm-in-case-of-connection-failure.patch diff --git a/queue-6.1/skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch b/queue-6.1/skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch new file mode 100644 index 00000000000..f8767be1cc6 --- 
/dev/null +++ b/queue-6.1/skmsg-skip-zero-length-skb-in-sk_msg_recvmsg.patch @@ -0,0 +1,105 @@ +From a56e65a2b1e3978341f865019dc4fa5cbfaba2e2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jul 2024 16:39:31 +0800 +Subject: skmsg: Skip zero length skb in sk_msg_recvmsg + +From: Geliang Tang + +[ Upstream commit f0c18025693707ec344a70b6887f7450bf4c826b ] + +When running BPF selftests (./test_progs -t sockmap_basic) on a Loongarch +platform, the following kernel panic occurs: + + [...] + Oops[#1]: + CPU: 22 PID: 2824 Comm: test_progs Tainted: G OE 6.10.0-rc2+ #18 + Hardware name: LOONGSON Dabieshan/Loongson-TC542F0, BIOS Loongson-UDK2018 + ... ... + ra: 90000000048bf6c0 sk_msg_recvmsg+0x120/0x560 + ERA: 9000000004162774 copy_page_to_iter+0x74/0x1c0 + CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) + PRMD: 0000000c (PPLV0 +PIE +PWE) + EUEN: 00000007 (+FPE +SXE +ASXE -BTE) + ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7) + ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0) + BADV: 0000000000000040 + PRID: 0014c011 (Loongson-64bit, Loongson-3C5000) + Modules linked in: bpf_testmod(OE) xt_CHECKSUM xt_MASQUERADE xt_conntrack + Process test_progs (pid: 2824, threadinfo=0000000000863a31, task=...) + Stack : ... + Call Trace: + [<9000000004162774>] copy_page_to_iter+0x74/0x1c0 + [<90000000048bf6c0>] sk_msg_recvmsg+0x120/0x560 + [<90000000049f2b90>] tcp_bpf_recvmsg_parser+0x170/0x4e0 + [<90000000049aae34>] inet_recvmsg+0x54/0x100 + [<900000000481ad5c>] sock_recvmsg+0x7c/0xe0 + [<900000000481e1a8>] __sys_recvfrom+0x108/0x1c0 + [<900000000481e27c>] sys_recvfrom+0x1c/0x40 + [<9000000004c076ec>] do_syscall+0x8c/0xc0 + [<9000000003731da4>] handle_syscall+0xc4/0x160 + Code: ... + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Fatal exception + Kernel relocated by 0x3510000 + .text @ 0x9000000003710000 + .data @ 0x9000000004d70000 + .bss @ 0x9000000006469400 + ---[ end Kernel panic - not syncing: Fatal exception ]--- + [...] + +This crash happens every time when running sockmap_skb_verdict_shutdown +subtest in sockmap_basic. + +This crash is because a NULL pointer is passed to page_address() in the +sk_msg_recvmsg(). Due to the different implementations depending on the +architecture, page_address(NULL) will trigger a panic on Loongarch +platform but not on x86 platform. So this bug was hidden on x86 platform +for a while, but now it is exposed on Loongarch platform. The root cause +is that a zero length skb (skb->len == 0) was put on the queue. + +This zero length skb is a TCP FIN packet, which was sent by shutdown(), +invoked in test_sockmap_skb_verdict_shutdown(): + + shutdown(p1, SHUT_WR); + +In this case, in sk_psock_skb_ingress_enqueue(), num_sge is zero, and no +page is put to this sge (see sg_set_page in sg_set_page), but this empty +sge is queued into ingress_msg list. + +And in sk_msg_recvmsg(), this empty sge is used, and a NULL page is got by +sg_page(sge). Pass this NULL page to copy_page_to_iter(), which passes it +to kmap_local_page() and to page_address(), then kernel panics. + +To solve this, we should skip this zero length skb. So in sk_msg_recvmsg(), +if copy is zero, that means it's a zero length skb, skip invoking +copy_page_to_iter(). We are using the EFAULT return triggered by +copy_page_to_iter to check for is_fin in tcp_bpf.c. 
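+
+The guard pattern, reduced to a sketch (a hypothetical wrapper around
+the real copy_page_to_iter(), not code from this patch): a zero-length
+element may have no backing page, so the page must not be touched
+unless there are bytes to copy.
+
+  static size_t copy_sge_bytes(struct page *page, unsigned int off,
+                               unsigned int copy, struct iov_iter *to)
+  {
+          /* a zero-length sge (e.g. from a FIN skb) has a NULL
+           * sg_page(), so kmap/page_address on it would crash */
+          if (!copy)
+                  return 0;
+
+          return copy_page_to_iter(page, off, copy, to);
+  }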
+ +Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") +Suggested-by: John Fastabend +Signed-off-by: Geliang Tang +Signed-off-by: Daniel Borkmann +Reviewed-by: John Fastabend +Link: https://lore.kernel.org/bpf/e3a16eacdc6740658ee02a33489b1b9d4912f378.1719992715.git.tanggeliang@kylinos.cn +Signed-off-by: Sasha Levin +--- + net/core/skmsg.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/core/skmsg.c b/net/core/skmsg.c +index 8b0459a6b629f..746d950de0e14 100644 +--- a/net/core/skmsg.c ++++ b/net/core/skmsg.c +@@ -433,7 +433,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; +- copy = copy_page_to_iter(page, sge->offset, copy, iter); ++ if (copy) ++ copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; +-- +2.43.0 + diff --git a/queue-6.1/tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch b/queue-6.1/tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch new file mode 100644 index 00000000000..b4116e4063c --- /dev/null +++ b/queue-6.1/tcp-fix-incorrect-undo-caused-by-dsack-of-tlp-retran.patch @@ -0,0 +1,107 @@ +From 688a773b8047b9d343714911fd4fb702b6d50fca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jul 2024 13:12:46 -0400 +Subject: tcp: fix incorrect undo caused by DSACK of TLP retransmit + +From: Neal Cardwell + +[ Upstream commit 0ec986ed7bab6801faed1440e8839dcc710331ff ] + +Loss recovery undo_retrans bookkeeping had a long-standing bug where a +DSACK from a spurious TLP retransmit packet could cause an erroneous +undo of a fast recovery or RTO recovery that repaired a single +really-lost packet (in a sequence range outside that of the TLP +retransmit). Basically, because the loss recovery state machine didn't +account for the fact that it sent a TLP retransmit, the DSACK for the +TLP retransmit could erroneously be implicitly be interpreted as +corresponding to the normal fast recovery or RTO recovery retransmit +that plugged a real hole, thus resulting in an improper undo. + +For example, consider the following buggy scenario where there is a +real packet loss but the congestion control response is improperly +undone because of this bug: + ++ send packets P1, P2, P3, P4 ++ P1 is really lost ++ send TLP retransmit of P4 ++ receive SACK for original P2, P3, P4 ++ enter fast recovery, fast-retransmit P1, increment undo_retrans to 1 ++ receive DSACK for TLP P4, decrement undo_retrans to 0, undo (bug!) ++ receive cumulative ACK for P1-P4 (fast retransmit plugged real hole) + +The fix: when we initialize undo machinery in tcp_init_undo(), if +there is a TLP retransmit in flight, then increment tp->undo_retrans +so that we make sure that we receive a DSACK corresponding to the TLP +retransmit, as well as DSACKs for all later normal retransmits, before +triggering a loss recovery undo. Note that we also have to move the +line that clears tp->tlp_high_seq for RTO recovery, so that upon RTO +we remember the tp->tlp_high_seq value until tcp_init_undo() and clear +it only afterward. + +Also note that the bug dates back to the original 2013 TLP +implementation, commit 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)"). + +However, this patch will only compile and work correctly with kernels +that have tp->tlp_retrans, which was added only in v5.8 in 2020 in +commit 76be93fc0702 ("tcp: allow at most one TLP probe per flight"). 
+So we associate this fix with that later commit. + +Fixes: 76be93fc0702 ("tcp: allow at most one TLP probe per flight") +Signed-off-by: Neal Cardwell +Reviewed-by: Eric Dumazet +Cc: Yuchung Cheng +Cc: Kevin Yang +Link: https://patch.msgid.link/20240703171246.1739561-1-ncardwell.sw@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ipv4/tcp_input.c | 11 ++++++++++- + net/ipv4/tcp_timer.c | 2 -- + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 317cb90d77102..359ffda9b736b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2101,8 +2101,16 @@ void tcp_clear_retrans(struct tcp_sock *tp) + static inline void tcp_init_undo(struct tcp_sock *tp) + { + tp->undo_marker = tp->snd_una; ++ + /* Retransmission still in flight may cause DSACKs later. */ +- tp->undo_retrans = tp->retrans_out ? : -1; ++ /* First, account for regular retransmits in flight: */ ++ tp->undo_retrans = tp->retrans_out; ++ /* Next, account for TLP retransmits in flight: */ ++ if (tp->tlp_high_seq && tp->tlp_retrans) ++ tp->undo_retrans++; ++ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ ++ if (!tp->undo_retrans) ++ tp->undo_retrans = -1; + } + + static bool tcp_is_rack(const struct sock *sk) +@@ -2181,6 +2189,7 @@ void tcp_enter_loss(struct sock *sk) + + tcp_set_ca_state(sk, TCP_CA_Loss); + tp->high_seq = tp->snd_nxt; ++ tp->tlp_high_seq = 0; + tcp_ecn_queue_cwr(tp); + + /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 44b49f7d1a9e6..f36492331ef0b 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -496,8 +496,6 @@ void tcp_retransmit_timer(struct sock *sk) + if (WARN_ON_ONCE(!skb)) + return; + +- tp->tlp_high_seq = 0; +- + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && + !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits +-- +2.43.0 + diff --git a/queue-6.1/udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch b/queue-6.1/udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch new file mode 100644 index 00000000000..c8b9f78f2f2 --- /dev/null +++ b/queue-6.1/udp-set-sock_rcu_free-earlier-in-udp_lib_get_port.patch @@ -0,0 +1,123 @@ +From 139f02dcb0bb8d37a5229c7a8b5c2b206eb18227 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 Jul 2024 12:13:56 -0700 +Subject: udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port(). + +From: Kuniyuki Iwashima + +[ Upstream commit 5c0b485a8c6116516f33925b9ce5b6104a6eadfd ] + +syzkaller triggered the warning [0] in udp_v4_early_demux(). + +In udp_v[46]_early_demux() and sk_lookup(), we do not touch the refcount +of the looked-up sk and use sock_pfree() as skb->destructor, so we check +SOCK_RCU_FREE to ensure that the sk is safe to access during the RCU grace +period. + +Currently, SOCK_RCU_FREE is flagged for a bound socket after being put +into the hash table. Moreover, the SOCK_RCU_FREE check is done too early +in udp_v[46]_early_demux() and sk_lookup(), so there could be a small race +window: + + CPU1 CPU2 + ---- ---- + udp_v4_early_demux() udp_lib_get_port() + | |- hlist_add_head_rcu() + |- sk = __udp4_lib_demux_lookup() | + |- DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); + `- sock_set_flag(sk, SOCK_RCU_FREE) + +We had the same bug in TCP and fixed it in commit 871019b22d1b ("net: +set SOCK_RCU_FREE before inserting socket into hashtable"). + +Let's apply the same fix for UDP. 
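+
+The ordering rule being restored, as a condensed sketch (simplified
+from the hunk below; the helper is hypothetical): every bit a lockless
+RCU reader may test must be written before the socket is published on
+the RCU-protected list.
+
+  static void udp_publish_sock(struct sock *sk, struct hlist_head *head)
+  {
+          sock_set_flag(sk, SOCK_RCU_FREE);  /* init first ... */
+          sk_add_node_rcu(sk, head);         /* ... publish last; the
+                                              * RCU list insertion
+                                              * orders the stores */
+  }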
+ +[0]: +WARNING: CPU: 0 PID: 11198 at net/ipv4/udp.c:2599 udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599 +Modules linked in: +CPU: 0 PID: 11198 Comm: syz-executor.1 Not tainted 6.9.0-g93bda33046e7 #13 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +RIP: 0010:udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599 +Code: c5 7a 15 fe bb 01 00 00 00 44 89 e9 31 ff d3 e3 81 e3 bf ef ff ff 89 de e8 2c 74 15 fe 85 db 0f 85 02 06 00 00 e8 9f 7a 15 fe <0f> 0b e8 98 7a 15 fe 49 8d 7e 60 e8 4f 39 2f fe 49 c7 46 60 20 52 +RSP: 0018:ffffc9000ce3fa58 EFLAGS: 00010293 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff8318c92c +RDX: ffff888036ccde00 RSI: ffffffff8318c2f1 RDI: 0000000000000001 +RBP: ffff88805a2dd6e0 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 0001ffffffffffff R12: ffff88805a2dd680 +R13: 0000000000000007 R14: ffff88800923f900 R15: ffff88805456004e +FS: 00007fc449127640(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007fc449126e38 CR3: 000000003de4b002 CR4: 0000000000770ef0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 +PKRU: 55555554 +Call Trace: + + ip_rcv_finish_core.constprop.0+0xbdd/0xd20 net/ipv4/ip_input.c:349 + ip_rcv_finish+0xda/0x150 net/ipv4/ip_input.c:447 + NF_HOOK include/linux/netfilter.h:314 [inline] + NF_HOOK include/linux/netfilter.h:308 [inline] + ip_rcv+0x16c/0x180 net/ipv4/ip_input.c:569 + __netif_receive_skb_one_core+0xb3/0xe0 net/core/dev.c:5624 + __netif_receive_skb+0x21/0xd0 net/core/dev.c:5738 + netif_receive_skb_internal net/core/dev.c:5824 [inline] + netif_receive_skb+0x271/0x300 net/core/dev.c:5884 + tun_rx_batched drivers/net/tun.c:1549 [inline] + tun_get_user+0x24db/0x2c50 drivers/net/tun.c:2002 + tun_chr_write_iter+0x107/0x1a0 drivers/net/tun.c:2048 + new_sync_write fs/read_write.c:497 [inline] + vfs_write+0x76f/0x8d0 fs/read_write.c:590 + ksys_write+0xbf/0x190 fs/read_write.c:643 + __do_sys_write fs/read_write.c:655 [inline] + __se_sys_write fs/read_write.c:652 [inline] + __x64_sys_write+0x41/0x50 fs/read_write.c:652 + x64_sys_call+0xe66/0x1990 arch/x86/include/generated/asm/syscalls_64.h:2 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x4b/0x53 +RIP: 0033:0x7fc44a68bc1f +Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 e9 cf f5 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 3c d0 f5 ff 48 +RSP: 002b:00007fc449126c90 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 00000000004bc050 RCX: 00007fc44a68bc1f +RDX: 0000000000000032 RSI: 00000000200000c0 RDI: 00000000000000c8 +RBP: 00000000004bc050 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000032 R11: 0000000000000293 R12: 0000000000000000 +R13: 000000000000000b R14: 00007fc44a5ec530 R15: 0000000000000000 + + +Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") +Reported-by: syzkaller +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Link: https://patch.msgid.link/20240709191356.24010-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ipv4/udp.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index b8f93c1479ae1..53267566808c1 
100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -319,6 +319,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, + goto fail_unlock; + } + ++ sock_set_flag(sk, SOCK_RCU_FREE); ++ + sk_add_node_rcu(sk, &hslot->head); + hslot->count++; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +@@ -335,7 +337,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, + hslot2->count++; + spin_unlock(&hslot2->lock); + } +- sock_set_flag(sk, SOCK_RCU_FREE); ++ + error = 0; + fail_unlock: + spin_unlock_bh(&hslot->lock); +-- +2.43.0 + diff --git a/queue-6.1/vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch b/queue-6.1/vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch new file mode 100644 index 00000000000..695f9bf84b0 --- /dev/null +++ b/queue-6.1/vfs-don-t-mod-negative-dentry-count-when-on-shrinker.patch @@ -0,0 +1,88 @@ +From 4bcc40214587f2d1bda953fb620d9c97f16a02a8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jul 2024 08:13:01 -0400 +Subject: vfs: don't mod negative dentry count when on shrinker list + +From: Brian Foster + +[ Upstream commit aabfe57ebaa75841db47ea59091ec3c5a06d2f52 ] + +The nr_dentry_negative counter is intended to only account negative +dentries that are present on the superblock LRU. Therefore, the LRU +add, remove and isolate helpers modify the counter based on whether +the dentry is negative, but the shrinker list related helpers do not +modify the counter, and the paths that change a dentry between +positive and negative only do so if DCACHE_LRU_LIST is set. + +The problem with this is that a dentry on a shrinker list still has +DCACHE_LRU_LIST set to indicate ->d_lru is in use. The additional +DCACHE_SHRINK_LIST flag denotes whether the dentry is on LRU or a +shrink related list. Therefore if a relevant operation (i.e. unlink) +occurs while a dentry is present on a shrinker list, and the +associated codepath only checks for DCACHE_LRU_LIST, then it is +technically possible to modify the negative dentry count for a +dentry that is off the LRU. Since the shrinker list related helpers +do not modify the negative dentry count (because non-LRU dentries +should not be included in the count) when the dentry is ultimately +removed from the shrinker list, this can cause the negative dentry +count to become permanently inaccurate. + +This problem can be reproduced via a heavy file create/unlink vs. +drop_caches workload. On an 80xcpu system, I start 80 tasks each +running a 1k file create/delete loop, and one task spinning on +drop_caches. After 10 minutes or so of runtime, the idle/clean cache +negative dentry count increases from somewhere in the range of 5-10 +entries to several hundred (and increasingly grows beyond +nr_dentry_unused). + +Tweak the logic in the paths that turn a dentry negative or positive +to filter out the case where the dentry is present on a shrink +related list. This allows the above workload to maintain an accurate +negative dentry count. 
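+
+The test the patch introduces, isolated as a sketch (the helper name
+is hypothetical): "counted on the LRU" now means DCACHE_LRU_LIST set
+and DCACHE_SHRINK_LIST clear, which a single mask-and-compare
+expresses.
+
+  static inline bool dentry_negative_counted(unsigned int flags)
+  {
+          /* d_lru is shared: LRU_LIST alone means the LRU proper,
+           * LRU_LIST|SHRINK_LIST means a private shrink list */
+          return (flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) ==
+                 DCACHE_LRU_LIST;
+  }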
+ +Fixes: af0c9af1b3f6 ("fs/dcache: Track & report number of negative dentries") +Signed-off-by: Brian Foster +Link: https://lore.kernel.org/r/20240703121301.247680-1-bfoster@redhat.com +Acked-by: Ian Kent +Reviewed-by: Josef Bacik +Reviewed-by: Waiman Long +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/dcache.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index 9b10f1872f6c9..04f32dc8d1ad8 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -356,7 +356,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + WRITE_ONCE(dentry->d_flags, flags); + dentry->d_inode = NULL; +- if (flags & DCACHE_LRU_LIST) ++ /* ++ * The negative counter only tracks dentries on the LRU. Don't inc if ++ * d_lru is on another list. ++ */ ++ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); + } + +@@ -2001,9 +2005,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) + + spin_lock(&dentry->d_lock); + /* +- * Decrement negative dentry count if it was in the LRU list. ++ * The negative counter only tracks dentries on the LRU. Don't dec if ++ * d_lru is on another list. + */ +- if (dentry->d_flags & DCACHE_LRU_LIST) ++ if ((dentry->d_flags & ++ (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) + this_cpu_dec(nr_dentry_negative); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); +-- +2.43.0 + -- 2.47.3