+++ /dev/null
-From d05cb470663a2a1879277e544f69e660208f08f2 Mon Sep 17 00:00:00 2001
-From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
-Date: Fri, 29 Dec 2023 11:51:34 -0500
-Subject: ftrace: Fix modification of direct_function hash while in use
-
-From: Steven Rostedt (Google) <rostedt@goodmis.org>
-
-commit d05cb470663a2a1879277e544f69e660208f08f2 upstream.
-
-Masami Hiramatsu reported a memory leak in register_ftrace_direct() that
-occurs when the number of new entries added is large enough to cause two
-allocations in the loop:
-
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
- if (!new)
- goto out_remove;
- entry->direct = addr;
- }
- }
-
-Where ftrace_add_rec_direct() has:
-
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- return NULL;
-
- *free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
-The "*free_hash = direct_functions;" can happen twice, losing the previous
-allocation of direct_functions.
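-
-As an illustration only (a userspace model with made-up names, not the
-kernel code), the leak boils down to funneling two saves of the old hash
-through a single pointer slot:
-
-	#include <stdlib.h>
-
-	struct hash { int dummy; };
-
-	static struct hash *live_hash;
-
-	static struct hash *grow(struct hash **saved)
-	{
-		struct hash *new_hash = malloc(sizeof(*new_hash));
-
-		if (!new_hash)
-			return NULL;
-		*saved = live_hash;	/* 2nd call overwrites the 1st save */
-		live_hash = new_hash;
-		return new_hash;
-	}
-
-	int main(void)
-	{
-		struct hash *saved = NULL;
-
-		live_hash = malloc(sizeof(*live_hash));
-		grow(&saved);	/* saved = original hash */
-		grow(&saved);	/* saved = first new hash; the original leaks */
-		free(saved);
-		free(live_hash);
-		return 0;	/* one allocation was never freed */
-	}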
-
-But this also exposed a more serious bug.
-
-The modification of direct_functions above is not safe. As
-direct_functions can be referenced at any time to look up the direct
-caller that a function should call, the time between:
-
- new_hash = dup_hash(direct_functions, size);
- and
- direct_functions = new_hash;
-
-can race with another CPU (or even this one, if it gets interrupted),
-during which the entries being moved to the new hash are invisible to
-readers.
-
-That's because the "dup_hash()" is really misnamed and is really a
-"move_hash()". It moves the entries from the old hash to the new one.
-
-Now even if that were changed, this code is still not proper, as
-direct_functions should not be updated until the end. Publishing the new
-hash only after it is fully populated is the best way to handle function
-reference changes, and is the way other parts of ftrace handle this.
-
-The following is done:
-
- 1. Change add_hash_entry() to return the entry it created and inserted
-    into the hash, instead of just returning success or failure.
-
- 2. Replace ftrace_add_rec_direct() with add_hash_entry(), and remove
- the former.
-
- 3. Allocate a "new_hash" at the start that is made for holding both the
- new hash entries as well as the existing entries in direct_functions.
-
- 4. Copy (not move) the direct_function entries over to the new_hash.
-
- 5. Copy the entries of the added hash to the new_hash.
-
- 6. If everything succeeds, then use rcu_assign_pointer() to update
-    direct_functions with the new_hash.
-
-This simplifies the code and fixes both the memory leak as well as the
-race condition mentioned above.
-
-Link: https://lore.kernel.org/all/170368070504.42064.8960569647118388081.stgit@devnote2/
-Link: https://lore.kernel.org/linux-trace-kernel/20231229115134.08dd5174@gandalf.local.home
-
-Cc: stable@vger.kernel.org
-Cc: Mark Rutland <mark.rutland@arm.com>
-Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
-Cc: Jiri Olsa <jolsa@kernel.org>
-Cc: Alexei Starovoitov <ast@kernel.org>
-Cc: Daniel Borkmann <daniel@iogearbox.net>
-Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
-Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()")
-Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++---------------------------
- 1 file changed, 47 insertions(+), 53 deletions(-)
-
---- a/kernel/trace/ftrace.c
-+++ b/kernel/trace/ftrace.c
-@@ -1152,18 +1152,19 @@ static void __add_hash_entry(struct ftra
- hash->count++;
- }
-
--static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
-+static struct ftrace_func_entry *
-+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
- {
- struct ftrace_func_entry *entry;
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
-- return -ENOMEM;
-+ return NULL;
-
- entry->ip = ip;
- __add_hash_entry(hash, entry);
-
-- return 0;
-+ return entry;
- }
-
- static void
-@@ -1318,7 +1319,6 @@ alloc_and_copy_ftrace_hash(int size_bits
- struct ftrace_func_entry *entry;
- struct ftrace_hash *new_hash;
- int size;
-- int ret;
- int i;
-
- new_hash = alloc_ftrace_hash(size_bits);
-@@ -1335,8 +1335,7 @@ alloc_and_copy_ftrace_hash(int size_bits
- size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
-- ret = add_hash_entry(new_hash, entry->ip);
-- if (ret < 0)
-+ if (add_hash_entry(new_hash, entry->ip) == NULL)
- goto free_hash;
- }
- }
-@@ -2439,7 +2438,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftr
-
- #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- /* Protected by rcu_tasks for reading, and direct_mutex for writing */
--static struct ftrace_hash *direct_functions = EMPTY_HASH;
-+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
- static DEFINE_MUTEX(direct_mutex);
- int ftrace_direct_func_count;
-
-@@ -2458,39 +2457,6 @@ unsigned long ftrace_find_rec_direct(uns
- return entry->direct;
- }
-
--static struct ftrace_func_entry*
--ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
-- struct ftrace_hash **free_hash)
--{
-- struct ftrace_func_entry *entry;
--
-- if (ftrace_hash_empty(direct_functions) ||
-- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
-- struct ftrace_hash *new_hash;
-- int size = ftrace_hash_empty(direct_functions) ? 0 :
-- direct_functions->count + 1;
--
-- if (size < 32)
-- size = 32;
--
-- new_hash = dup_hash(direct_functions, size);
-- if (!new_hash)
-- return NULL;
--
-- *free_hash = direct_functions;
-- direct_functions = new_hash;
-- }
--
-- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-- if (!entry)
-- return NULL;
--
-- entry->ip = ip;
-- entry->direct = addr;
-- __add_hash_entry(direct_functions, entry);
-- return entry;
--}
--
- static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
- {
-@@ -4065,8 +4031,8 @@ enter_record(struct ftrace_hash *hash, s
- /* Do nothing if it exists */
- if (entry)
- return 0;
--
-- ret = add_hash_entry(hash, rec->ip);
-+ if (add_hash_entry(hash, rec->ip) == NULL)
-+ ret = -ENOMEM;
- }
- return ret;
- }
-@@ -5107,7 +5073,8 @@ __ftrace_match_addr(struct ftrace_hash *
- return 0;
- }
-
-- return add_hash_entry(hash, ip);
-+ entry = add_hash_entry(hash, ip);
-+ return entry ? 0 : -ENOMEM;
- }
-
- static int
-@@ -5633,7 +5600,7 @@ static void remove_direct_functions_hash
- */
- int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
- {
-- struct ftrace_hash *hash, *free_hash = NULL;
-+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
- struct ftrace_func_entry *entry, *new;
- int err = -EBUSY, size, i;
-
-@@ -5659,34 +5626,61 @@ int register_ftrace_direct_multi(struct
- }
- }
-
-- /* ... and insert them to direct_functions hash. */
- err = -ENOMEM;
-+
-+ /* Make a copy hash to place the new and the old entries in */
-+ size = hash->count + direct_functions->count;
-+ if (size > 32)
-+ size = 32;
-+ new_hash = alloc_ftrace_hash(fls(size));
-+ if (!new_hash)
-+ goto out_unlock;
-+
-+ /* Now copy over the existing direct entries */
-+ size = 1 << direct_functions->size_bits;
-+ for (i = 0; i < size; i++) {
-+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
-+ new = add_hash_entry(new_hash, entry->ip);
-+ if (!new)
-+ goto out_unlock;
-+ new->direct = entry->direct;
-+ }
-+ }
-+
-+ /* ... and add the new entries */
-+ size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
-- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
-+ new = add_hash_entry(new_hash, entry->ip);
- if (!new)
-- goto out_remove;
-+ goto out_unlock;
-+ /* Update both the copy and the hash entry */
-+ new->direct = addr;
- entry->direct = addr;
- }
- }
-
-+ free_hash = direct_functions;
-+ rcu_assign_pointer(direct_functions, new_hash);
-+ new_hash = NULL;
-+
- ops->func = call_direct_funcs;
- ops->flags = MULTI_FLAGS;
- ops->trampoline = FTRACE_REGS_ADDR;
-
- err = register_ftrace_function_nolock(ops);
-
-- out_remove:
-- if (err)
-- remove_direct_functions_hash(hash, addr);
--
- out_unlock:
- mutex_unlock(&direct_mutex);
-
-- if (free_hash) {
-+ if (free_hash && free_hash != EMPTY_HASH) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
-+
-+ if (new_hash)
-+ free_ftrace_hash(new_hash);
-+
- return err;
- }
- EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
-@@ -6510,7 +6504,7 @@ ftrace_graph_set_hash(struct ftrace_hash
-
- if (entry)
- continue;
-- if (add_hash_entry(hash, rec->ip) < 0)
-+ if (add_hash_entry(hash, rec->ip) == NULL)
- goto out;
- } else {
- if (entry) {
--- /dev/null
+From 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Tue, 19 Dec 2023 19:44:49 +0100
+Subject: netfilter: nf_tables: skip set commit for deleted/destroyed sets
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream.
+
+NFT_MSG_DELSET deactivates all elements in the set, so skip
+set->ops->commit() to avoid the unnecessary clone (in the pipapo case)
+as well as the sync GC cycle, which could deactivate already-expired
+elements in such a set a second time.
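+
+With the fix applied, the commit walk reads as below (context
+reconstructed from the hunk; only the set->dead test is new, and the
+comment is editorial):
+
+	list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+		list_del_init(&set->pending_update);
+
+		/* A set queued for NFT_MSG_DELSET is marked dead: its
+		 * elements are already deactivated, so committing it would
+		 * only trigger the pipapo clone and a sync GC pass. */
+		if (!set->ops->commit || set->dead)
+			continue;
+
+		set->ops->commit(set);
+	}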
+
+Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane")
+Reported-by: Kevin Rich <kevinrich1337@gmail.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nf_tables_api.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -9480,7 +9480,7 @@ static void nft_set_commit_update(struct
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update);
+
+- if (!set->ops->commit)
++ if (!set->ops->commit || set->dead)
+ continue;
+
+ set->ops->commit(set);
--- /dev/null
+From b803d7c664d55705831729d2f2e29c874bcd62ea Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Mon, 18 Dec 2023 23:07:12 -0500
+Subject: ring-buffer: Fix slowpath of interrupted event
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit b803d7c664d55705831729d2f2e29c874bcd62ea upstream.
+
+To synchronize the timestamps with the ring buffer reservation, two
+timestamps are saved in the buffer metadata.
+
+1. before_stamp
+2. write_stamp
+
+When the two are equal, the write_stamp is considered valid: it may be
+used to calculate the delta of the next event, since the write_stamp is
+the timestamp of the previously reserved event on the buffer.
+
+This is done by the following:
+
+ /*A*/ w = current position on the ring buffer
+ before = before_stamp
+ after = write_stamp
+ ts = read current timestamp
+
+ if (before != after) {
+ write_stamp is not valid, force adding an absolute
+ timestamp.
+ }
+
+ /*B*/ before_stamp = ts
+
+ /*C*/ write = local_add_return(event length, position on ring buffer)
+
+ if (w == write - event length) {
+ /* Nothing interrupted between A and C */
+ /*E*/ write_stamp = ts;
+ delta = ts - after
+ /*
+ * If nothing interrupted again,
+ * before_stamp == write_stamp and write_stamp
+ * can be used to calculate the delta for
+ * events that come in after this one.
+ */
+ } else {
+
+ /*
+ * The slow path!
+ * Was interrupted between A and C.
+ */
+
+This is where the bug lies. We currently have:
+
+ after = write_stamp
+ ts = read current timestamp
+
+ /*F*/ if (write == current position on the ring buffer &&
+ after < ts && cmpxchg(write_stamp, after, ts)) {
+
+ delta = ts - after;
+
+ } else {
+ delta = 0;
+ }
+
+The assumption is that if the current position on the ring buffer hasn't
+moved between C and F, then the reservation also was not interrupted, and
+the last event written has a timestamp that matches the write_stamp. That
+is, the write_stamp is valid.
+
+But this may not be the case. Suppose:
+
+A task context event was interrupted by a softirq between B and C.
+
+The softirq wrote an event that got interrupted by a hard irq between
+C and E.
+
+And the hard irq wrote an event (it does not need to be interrupted).
+
+We have:
+
+ /*B*/ before_stamp = ts of normal context
+
+ ---> interrupted by softirq
+
+ /*B*/ before_stamp = ts of softirq context
+
+ ---> interrupted by hardirq
+
+ /*B*/ before_stamp = ts of hard irq context
+ /*E*/ write_stamp = ts of hard irq context
+
+ /* matches and write_stamp valid */
+ <----
+
+ /*E*/ write_stamp = ts of softirq context
+
+ /* No longer matches before_stamp, write_stamp is not valid! */
+
+ <---
+
+ w != write - length, go to slow path
+
+// Right now the order of events in the ring buffer is:
+//
+// |-- softirq event --|-- hard irq event --|-- normal context event --|
+//
+
+ after = write_stamp (this is the ts of softirq)
+ ts = read current timestamp
+
+ if (write == current position on the ring buffer [true] &&
+ after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) {
+
+ delta = ts - after [Wrong!]
+
+The delta should be between the hard irq event and the normal context
+event, but the above logic computed it between the softirq event and the
+normal context event, with the hard irq event sitting between the two.
+This shifts all the remaining event timestamps on the sub-buffer
+incorrectly.
+
+The write_stamp is only valid if it matches the before_stamp. The cmpxchg
+does nothing to help this.
+
+Instead, the following logic can be done to fix this:
+
+ before = before_stamp
+ ts = read current timestamp
+ before_stamp = ts
+
+ after = write_stamp
+
+ if (write == current position on the ring buffer &&
+ after == before && after < ts) {
+
+ delta = ts - after
+
+ } else {
+ delta = 0;
+ }
+
+The above will only use the write_stamp if it still matches the
+before_stamp and has been verified not to have changed since C.
+
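+A small userspace model (illustrative, with made-up numbers; not the
+kernel code) replays the scenario above against the new check:
+
+	#include <assert.h>
+
+	typedef unsigned long long u64;
+
+	static u64 slow_path_delta(int pos_unmoved, u64 before, u64 after, u64 ts)
+	{
+		if (pos_unmoved && after == before && after < ts)
+			return ts - after;	/* write_stamp proven valid */
+		return 0;			/* previous timestamp was lost */
+	}
+
+	int main(void)
+	{
+		/* hard irq left before_stamp=110, softirq left write_stamp=105:
+		 * the stamps mismatch, so the bogus "ts - after" is avoided */
+		assert(slow_path_delta(1, 110, 105, 120) == 0);
+
+		/* nothing interrupted: both stamps read 105 */
+		assert(slow_path_delta(1, 105, 105, 120) == 15);
+		return 0;
+	}
+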
+As a bonus, with this logic we do not need any 64-bit cmpxchg() at all!
+
+This means the 32-bit rb_time_t workaround can finally be removed. But
+that's for a later time.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/
+Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c | 81 +++++++++++++--------------------------------
+ 1 file changed, 24 insertions(+), 57 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -705,48 +705,6 @@ rb_time_read_cmpxchg(local_t *l, unsigne
+ return ret == expect;
+ }
+
+-static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+-{
+- unsigned long cnt, top, bottom, msb;
+- unsigned long cnt2, top2, bottom2, msb2;
+- u64 val;
+-
+- /* Any interruptions in this function should cause a failure */
+- cnt = local_read(&t->cnt);
+-
+- /* The cmpxchg always fails if it interrupted an update */
+- if (!__rb_time_read(t, &val, &cnt2))
+- return false;
+-
+- if (val != expect)
+- return false;
+-
+- if ((cnt & 3) != cnt2)
+- return false;
+-
+- cnt2 = cnt + 1;
+-
+- rb_time_split(val, &top, &bottom, &msb);
+- msb = rb_time_val_cnt(msb, cnt);
+- top = rb_time_val_cnt(top, cnt);
+- bottom = rb_time_val_cnt(bottom, cnt);
+-
+- rb_time_split(set, &top2, &bottom2, &msb2);
+- msb2 = rb_time_val_cnt(msb2, cnt);
+- top2 = rb_time_val_cnt(top2, cnt2);
+- bottom2 = rb_time_val_cnt(bottom2, cnt2);
+-
+- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->top, top, top2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+- return false;
+- return true;
+-}
+-
+ #else /* 64 bits */
+
+ /* local64_t always succeeds */
+@@ -760,13 +718,6 @@ static void rb_time_set(rb_time_t *t, u6
+ {
+ local64_set(&t->time, val);
+ }
+-
+-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+-{
+- u64 val;
+- val = local64_cmpxchg(&t->time, expect, set);
+- return val == expect;
+-}
+ #endif
+
+ /*
+@@ -3613,20 +3564,36 @@ __rb_reserve_next(struct ring_buffer_per
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+- /* Was interrupted before here, write_stamp must be valid */
++
++ /* Save the old before_stamp */
++ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ RB_WARN_ON(cpu_buffer, !a_ok);
++
++ /*
++ * Read a new timestamp and update the before_stamp to make
++ * the next event after this one force using an absolute
++ * timestamp. This is in case an interrupt were to come in
++ * between E and F.
++ */
+ ts = rb_time_stamp(cpu_buffer->buffer);
++ rb_time_set(&cpu_buffer->before_stamp, ts);
++
+ barrier();
+- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+- info->after < ts &&
+- rb_time_cmpxchg(&cpu_buffer->write_stamp,
+- info->after, ts)) {
+- /* Nothing came after this event between C and E */
++ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
++ /* Was interrupted before here, write_stamp must be valid */
++ RB_WARN_ON(cpu_buffer, !a_ok);
++ barrier();
++ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
++ info->after == info->before && info->after < ts) {
++ /*
++ * Nothing came after this event between C and F, it is
++ * safe to use info->after for the delta as it
++ * matched info->before and is still valid.
++ */
+ info->delta = ts - info->after;
+ } else {
+ /*
+- * Interrupted between C and E:
++ * Interrupted between C and F:
+ * Lost the previous events time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
mm-memory-failure-cast-index-to-loff_t-before-shifting-it.patch
mm-memory-failure-check-the-mapcount-of-the-precise-page.patch
ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch
-ftrace-fix-modification-of-direct_function-hash-while-in-use.patch
tracing-fix-blocked-reader-of-snapshot-buffer.patch
ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch
+netfilter-nf_tables-skip-set-commit-for-deleted-destroyed-sets.patch
+ring-buffer-fix-slowpath-of-interrupted-event.patch