From: Greg Kroah-Hartman Date: Wed, 3 Jan 2024 10:34:11 +0000 (+0100) Subject: 6.1-stable patches X-Git-Tag: v5.10.206~16 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2fa8d1c53604a2e3efdf364ad203e9baf7fac64b;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch ring-buffer-fix-slowpath-of-interrupted-event.patch --- diff --git a/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch b/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch deleted file mode 100644 index f661d8f4b70..00000000000 --- a/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch +++ /dev/null @@ -1,304 +0,0 @@ -From d05cb470663a2a1879277e544f69e660208f08f2 Mon Sep 17 00:00:00 2001 -From: "Steven Rostedt (Google)" -Date: Fri, 29 Dec 2023 11:51:34 -0500 -Subject: ftrace: Fix modification of direct_function hash while in use - -From: Steven Rostedt (Google) - -commit d05cb470663a2a1879277e544f69e660208f08f2 upstream. - -Masami Hiramatsu reported a memory leak in register_ftrace_direct() where -if the number of new entries are added is large enough to cause two -allocations in the loop: - - for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); - if (!new) - goto out_remove; - entry->direct = addr; - } - } - -Where ftrace_add_rec_direct() has: - - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - return NULL; - - *free_hash = direct_functions; - direct_functions = new_hash; - } - -The "*free_hash = direct_functions;" can happen twice, losing the previous -allocation of direct_functions. - -But this also exposed a more serious bug. - -The modification of direct_functions above is not safe. As -direct_functions can be referenced at any time to find what direct caller -it should call, the time between: - - new_hash = dup_hash(direct_functions, size); - and - direct_functions = new_hash; - -can have a race with another CPU (or even this one if it gets interrupted), -and the entries being moved to the new hash are not referenced. - -That's because the "dup_hash()" is really misnamed and is really a -"move_hash()". It moves the entries from the old hash to the new one. - -Now even if that was changed, this code is not proper as direct_functions -should not be updated until the end. That is the best way to handle -function reference changes, and is the way other parts of ftrace handles -this. - -The following is done: - - 1. Change add_hash_entry() to return the entry it created and inserted - into the hash, and not just return success or not. - - 2. Replace ftrace_add_rec_direct() with add_hash_entry(), and remove - the former. - - 3. Allocate a "new_hash" at the start that is made for holding both the - new hash entries as well as the existing entries in direct_functions. - - 4. Copy (not move) the direct_function entries over to the new_hash. - - 5. Copy the entries of the added hash to the new_hash. - - 6. If everything succeeds, then use rcu_pointer_assign() to update the - direct_functions with the new_hash. - -This simplifies the code and fixes both the memory leak as well as the -race condition mentioned above. - -Link: https://lore.kernel.org/all/170368070504.42064.8960569647118388081.stgit@devnote2/ -Link: https://lore.kernel.org/linux-trace-kernel/20231229115134.08dd5174@gandalf.local.home - -Cc: stable@vger.kernel.org -Cc: Mark Rutland -Cc: Mathieu Desnoyers -Cc: Jiri Olsa -Cc: Alexei Starovoitov -Cc: Daniel Borkmann -Acked-by: Masami Hiramatsu (Google) -Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") -Signed-off-by: Steven Rostedt (Google) -Signed-off-by: Greg Kroah-Hartman ---- - kernel/trace/ftrace.c | 100 +++++++++++++++++++++++--------------------------- - 1 file changed, 47 insertions(+), 53 deletions(-) - ---- a/kernel/trace/ftrace.c -+++ b/kernel/trace/ftrace.c -@@ -1152,18 +1152,19 @@ static void __add_hash_entry(struct ftra - hash->count++; - } - --static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) -+static struct ftrace_func_entry * -+add_hash_entry(struct ftrace_hash *hash, unsigned long ip) - { - struct ftrace_func_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) -- return -ENOMEM; -+ return NULL; - - entry->ip = ip; - __add_hash_entry(hash, entry); - -- return 0; -+ return entry; - } - - static void -@@ -1318,7 +1319,6 @@ alloc_and_copy_ftrace_hash(int size_bits - struct ftrace_func_entry *entry; - struct ftrace_hash *new_hash; - int size; -- int ret; - int i; - - new_hash = alloc_ftrace_hash(size_bits); -@@ -1335,8 +1335,7 @@ alloc_and_copy_ftrace_hash(int size_bits - size = 1 << hash->size_bits; - for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, &hash->buckets[i], hlist) { -- ret = add_hash_entry(new_hash, entry->ip); -- if (ret < 0) -+ if (add_hash_entry(new_hash, entry->ip) == NULL) - goto free_hash; - } - } -@@ -2439,7 +2438,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftr - - #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - /* Protected by rcu_tasks for reading, and direct_mutex for writing */ --static struct ftrace_hash *direct_functions = EMPTY_HASH; -+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; - static DEFINE_MUTEX(direct_mutex); - int ftrace_direct_func_count; - -@@ -2458,39 +2457,6 @@ unsigned long ftrace_find_rec_direct(uns - return entry->direct; - } - --static struct ftrace_func_entry* --ftrace_add_rec_direct(unsigned long ip, unsigned long addr, -- struct ftrace_hash **free_hash) --{ -- struct ftrace_func_entry *entry; -- -- if (ftrace_hash_empty(direct_functions) || -- direct_functions->count > 2 * (1 << direct_functions->size_bits)) { -- struct ftrace_hash *new_hash; -- int size = ftrace_hash_empty(direct_functions) ? 0 : -- direct_functions->count + 1; -- -- if (size < 32) -- size = 32; -- -- new_hash = dup_hash(direct_functions, size); -- if (!new_hash) -- return NULL; -- -- *free_hash = direct_functions; -- direct_functions = new_hash; -- } -- -- entry = kmalloc(sizeof(*entry), GFP_KERNEL); -- if (!entry) -- return NULL; -- -- entry->ip = ip; -- entry->direct = addr; -- __add_hash_entry(direct_functions, entry); -- return entry; --} -- - static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) - { -@@ -4065,8 +4031,8 @@ enter_record(struct ftrace_hash *hash, s - /* Do nothing if it exists */ - if (entry) - return 0; -- -- ret = add_hash_entry(hash, rec->ip); -+ if (add_hash_entry(hash, rec->ip) == NULL) -+ ret = -ENOMEM; - } - return ret; - } -@@ -5107,7 +5073,8 @@ __ftrace_match_addr(struct ftrace_hash * - return 0; - } - -- return add_hash_entry(hash, ip); -+ entry = add_hash_entry(hash, ip); -+ return entry ? 0 : -ENOMEM; - } - - static int -@@ -5633,7 +5600,7 @@ static void remove_direct_functions_hash - */ - int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) - { -- struct ftrace_hash *hash, *free_hash = NULL; -+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL; - struct ftrace_func_entry *entry, *new; - int err = -EBUSY, size, i; - -@@ -5659,34 +5626,61 @@ int register_ftrace_direct_multi(struct - } - } - -- /* ... and insert them to direct_functions hash. */ - err = -ENOMEM; -+ -+ /* Make a copy hash to place the new and the old entries in */ -+ size = hash->count + direct_functions->count; -+ if (size > 32) -+ size = 32; -+ new_hash = alloc_ftrace_hash(fls(size)); -+ if (!new_hash) -+ goto out_unlock; -+ -+ /* Now copy over the existing direct entries */ -+ size = 1 << direct_functions->size_bits; -+ for (i = 0; i < size; i++) { -+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) { -+ new = add_hash_entry(new_hash, entry->ip); -+ if (!new) -+ goto out_unlock; -+ new->direct = entry->direct; -+ } -+ } -+ -+ /* ... and add the new entries */ -+ size = 1 << hash->size_bits; - for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, &hash->buckets[i], hlist) { -- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); -+ new = add_hash_entry(new_hash, entry->ip); - if (!new) -- goto out_remove; -+ goto out_unlock; -+ /* Update both the copy and the hash entry */ -+ new->direct = addr; - entry->direct = addr; - } - } - -+ free_hash = direct_functions; -+ rcu_assign_pointer(direct_functions, new_hash); -+ new_hash = NULL; -+ - ops->func = call_direct_funcs; - ops->flags = MULTI_FLAGS; - ops->trampoline = FTRACE_REGS_ADDR; - - err = register_ftrace_function_nolock(ops); - -- out_remove: -- if (err) -- remove_direct_functions_hash(hash, addr); -- - out_unlock: - mutex_unlock(&direct_mutex); - -- if (free_hash) { -+ if (free_hash && free_hash != EMPTY_HASH) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } -+ -+ if (new_hash) -+ free_ftrace_hash(new_hash); -+ - return err; - } - EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); -@@ -6510,7 +6504,7 @@ ftrace_graph_set_hash(struct ftrace_hash - - if (entry) - continue; -- if (add_hash_entry(hash, rec->ip) < 0) -+ if (add_hash_entry(hash, rec->ip) == NULL) - goto out; - } else { - if (entry) { diff --git a/queue-6.1/netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch b/queue-6.1/netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch new file mode 100644 index 00000000000..c3009409de0 --- /dev/null +++ b/queue-6.1/netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch @@ -0,0 +1,33 @@ +From 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a Mon Sep 17 00:00:00 2001 +From: Pablo Neira Ayuso +Date: Tue, 19 Dec 2023 19:44:49 +0100 +Subject: netfilter: nf_tables: skip set commit for deleted/destroyed sets + +From: Pablo Neira Ayuso + +commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream. + +NFT_MSG_DELSET deactivates all elements in the set, skip +set->ops->commit() to avoid the unnecessary clone (for the pipapo case) +as well as the sync GC cycle, which could deactivate again expired +elements in such set. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Reported-by: Kevin Rich +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/nf_tables_api.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -9480,7 +9480,7 @@ static void nft_set_commit_update(struct + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + +- if (!set->ops->commit) ++ if (!set->ops->commit || set->dead) + continue; + + set->ops->commit(set); diff --git a/queue-6.1/ring-buffer-fix-slowpath-of-interrupted-event.patch b/queue-6.1/ring-buffer-fix-slowpath-of-interrupted-event.patch new file mode 100644 index 00000000000..f3a08b7bc5f --- /dev/null +++ b/queue-6.1/ring-buffer-fix-slowpath-of-interrupted-event.patch @@ -0,0 +1,276 @@ +From b803d7c664d55705831729d2f2e29c874bcd62ea Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Mon, 18 Dec 2023 23:07:12 -0500 +Subject: ring-buffer: Fix slowpath of interrupted event + +From: Steven Rostedt (Google) + +commit b803d7c664d55705831729d2f2e29c874bcd62ea upstream. + +To synchronize the timestamps with the ring buffer reservation, there are +two timestamps that are saved in the buffer meta data. + +1. before_stamp +2. write_stamp + +When the two are equal, the write_stamp is considered valid, as in, it may +be used to calculate the delta of the next event as the write_stamp is the +timestamp of the previous reserved event on the buffer. + +This is done by the following: + + /*A*/ w = current position on the ring buffer + before = before_stamp + after = write_stamp + ts = read current timestamp + + if (before != after) { + write_stamp is not valid, force adding an absolute + timestamp. + } + + /*B*/ before_stamp = ts + + /*C*/ write = local_add_return(event length, position on ring buffer) + + if (w == write - event length) { + /* Nothing interrupted between A and C */ + /*E*/ write_stamp = ts; + delta = ts - after + /* + * If nothing interrupted again, + * before_stamp == write_stamp and write_stamp + * can be used to calculate the delta for + * events that come in after this one. + */ + } else { + + /* + * The slow path! + * Was interrupted between A and C. + */ + +This is the place that there's a bug. We currently have: + + after = write_stamp + ts = read current timestamp + + /*F*/ if (write == current position on the ring buffer && + after < ts && cmpxchg(write_stamp, after, ts)) { + + delta = ts - after; + + } else { + delta = 0; + } + +The assumption is that if the current position on the ring buffer hasn't +moved between C and F, then it also was not interrupted, and that the last +event written has a timestamp that matches the write_stamp. That is the +write_stamp is valid. + +But this may not be the case: + +If a task context event was interrupted by softirq between B and C. + +And the softirq wrote an event that got interrupted by a hard irq between +C and E. + +and the hard irq wrote an event (does not need to be interrupted) + +We have: + + /*B*/ before_stamp = ts of normal context + + ---> interrupted by softirq + + /*B*/ before_stamp = ts of softirq context + + ---> interrupted by hardirq + + /*B*/ before_stamp = ts of hard irq context + /*E*/ write_stamp = ts of hard irq context + + /* matches and write_stamp valid */ + <---- + + /*E*/ write_stamp = ts of softirq context + + /* No longer matches before_stamp, write_stamp is not valid! */ + + <--- + + w != write - length, go to slow path + +// Right now the order of events in the ring buffer is: +// +// |-- softirq event --|-- hard irq event --|-- normal context event --| +// + + after = write_stamp (this is the ts of softirq) + ts = read current timestamp + + if (write == current position on the ring buffer [true] && + after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { + + delta = ts - after [Wrong!] + +The delta is to be between the hard irq event and the normal context +event, but the above logic made the delta between the softirq event and +the normal context event, where the hard irq event is between the two. This +will shift all the remaining event timestamps on the sub-buffer +incorrectly. + +The write_stamp is only valid if it matches the before_stamp. The cmpxchg +does nothing to help this. + +Instead, the following logic can be done to fix this: + + before = before_stamp + ts = read current timestamp + before_stamp = ts + + after = write_stamp + + if (write == current position on the ring buffer && + after == before && after < ts) { + + delta = ts - after + + } else { + delta = 0; + } + +The above will only use the write_stamp if it still matches before_stamp +and was tested to not have changed since C. + +As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! + +This means the 32-bit rb_time_t workaround can finally be removed. But +that's for a later time. + +Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ +Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Linus Torvalds +Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/ring_buffer.c | 81 +++++++++++++-------------------------------- + 1 file changed, 24 insertions(+), 57 deletions(-) + +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -705,48 +705,6 @@ rb_time_read_cmpxchg(local_t *l, unsigne + return ret == expect; + } + +-static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +-{ +- unsigned long cnt, top, bottom, msb; +- unsigned long cnt2, top2, bottom2, msb2; +- u64 val; +- +- /* Any interruptions in this function should cause a failure */ +- cnt = local_read(&t->cnt); +- +- /* The cmpxchg always fails if it interrupted an update */ +- if (!__rb_time_read(t, &val, &cnt2)) +- return false; +- +- if (val != expect) +- return false; +- +- if ((cnt & 3) != cnt2) +- return false; +- +- cnt2 = cnt + 1; +- +- rb_time_split(val, &top, &bottom, &msb); +- msb = rb_time_val_cnt(msb, cnt); +- top = rb_time_val_cnt(top, cnt); +- bottom = rb_time_val_cnt(bottom, cnt); +- +- rb_time_split(set, &top2, &bottom2, &msb2); +- msb2 = rb_time_val_cnt(msb2, cnt); +- top2 = rb_time_val_cnt(top2, cnt2); +- bottom2 = rb_time_val_cnt(bottom2, cnt2); +- +- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->top, top, top2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) +- return false; +- return true; +-} +- + #else /* 64 bits */ + + /* local64_t always succeeds */ +@@ -760,13 +718,6 @@ static void rb_time_set(rb_time_t *t, u6 + { + local64_set(&t->time, val); + } +- +-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +-{ +- u64 val; +- val = local64_cmpxchg(&t->time, expect, set); +- return val == expect; +-} + #endif + + /* +@@ -3613,20 +3564,36 @@ __rb_reserve_next(struct ring_buffer_per + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ +- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); +- /* Was interrupted before here, write_stamp must be valid */ ++ ++ /* Save the old before_stamp */ ++ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + RB_WARN_ON(cpu_buffer, !a_ok); ++ ++ /* ++ * Read a new timestamp and update the before_stamp to make ++ * the next event after this one force using an absolute ++ * timestamp. This is in case an interrupt were to come in ++ * between E and F. ++ */ + ts = rb_time_stamp(cpu_buffer->buffer); ++ rb_time_set(&cpu_buffer->before_stamp, ts); ++ + barrier(); +- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && +- info->after < ts && +- rb_time_cmpxchg(&cpu_buffer->write_stamp, +- info->after, ts)) { +- /* Nothing came after this event between C and E */ ++ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); ++ /* Was interrupted before here, write_stamp must be valid */ ++ RB_WARN_ON(cpu_buffer, !a_ok); ++ barrier(); ++ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && ++ info->after == info->before && info->after < ts) { ++ /* ++ * Nothing came after this event between C and F, it is ++ * safe to use info->after for the delta as it ++ * matched info->before and is still valid. ++ */ + info->delta = ts - info->after; + } else { + /* +- * Interrupted between C and E: ++ * Interrupted between C and F: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that diff --git a/queue-6.1/series b/queue-6.1/series index 85abc97b5b4..dc93b17f68c 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -89,6 +89,7 @@ mm-migrate-high-order-folios-in-swap-cache-correctly.patch mm-memory-failure-cast-index-to-loff_t-before-shifting-it.patch mm-memory-failure-check-the-mapcount-of-the-precise-page.patch ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch -ftrace-fix-modification-of-direct_function-hash-while-in-use.patch tracing-fix-blocked-reader-of-snapshot-buffer.patch ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch +netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch +ring-buffer-fix-slowpath-of-interrupted-event.patch