From: Greg Kroah-Hartman Date: Fri, 23 Feb 2024 16:01:04 +0000 (+0100) Subject: 5.15-stable patches X-Git-Tag: v4.19.308~98 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=18379bdafcdabe8c6966e816c4bb8e91343a8762;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: bpf-add-struct-for-bin_args-arg-in-bpf_bprintf_prepare.patch bpf-do-cleanup-in-bpf_bprintf_cleanup-only-when-needed.patch bpf-merge-printk-and-seq_printf-vararg-max-macros.patch bpf-remove-trace_printk_lock.patch userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch zonefs-improve-error-handling.patch --- diff --git a/queue-5.15/bpf-add-struct-for-bin_args-arg-in-bpf_bprintf_prepare.patch b/queue-5.15/bpf-add-struct-for-bin_args-arg-in-bpf_bprintf_prepare.patch new file mode 100644 index 00000000000..613de9b914a --- /dev/null +++ b/queue-5.15/bpf-add-struct-for-bin_args-arg-in-bpf_bprintf_prepare.patch @@ -0,0 +1,195 @@ +From stable+bounces-20402-greg=kroah.com@vger.kernel.org Sat Feb 17 13:13:54 2024 +From: Thadeu Lima de Souza Cascardo +Date: Sat, 17 Feb 2024 09:13:19 -0300 +Subject: bpf: Add struct for bin_args arg in bpf_bprintf_prepare +To: stable@vger.kernel.org +Cc: cascardo@igalia.com, jolsa@kernel.org, daniel@iogearbox.net, yhs@fb.com +Message-ID: <20240217121321.2045993-6-cascardo@igalia.com> + +From: Jiri Olsa + +commit 78aa1cc9404399a15d2a1205329c6a06236f5378 upstream. + +Adding struct bpf_bprintf_data to hold bin_args argument for +bpf_bprintf_prepare function. + +We will add another return argument to bpf_bprintf_prepare and +pass the struct to bpf_bprintf_cleanup for proper cleanup in +following changes. + +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Acked-by: Yonghong Song +Link: https://lore.kernel.org/bpf/20221215214430.1336195-2-jolsa@kernel.org +[cascardo: there is no bpf_trace_vprintk in 5.15] +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf.h | 7 ++++++- + kernel/bpf/helpers.c | 24 +++++++++++++----------- + kernel/bpf/verifier.c | 3 ++- + kernel/trace/bpf_trace.c | 22 +++++++++++++--------- + 4 files changed, 34 insertions(+), 22 deletions(-) + +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2292,8 +2292,13 @@ bool btf_id_set_contains(const struct bt + + #define MAX_BPRINTF_VARARGS 12 + ++struct bpf_bprintf_data { ++ u32 *bin_args; ++ bool get_bin_args; ++}; ++ + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +- u32 **bin_buf, u32 num_args); ++ u32 num_args, struct bpf_bprintf_data *data); + void bpf_bprintf_cleanup(void); + + #endif /* _LINUX_BPF_H */ +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -752,16 +752,16 @@ void bpf_bprintf_cleanup(void) + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: +- * - Format string verification only: when bin_args is NULL ++ * - Format string verification only: when data->get_bin_args is false + * - Arguments preparation: in addition to the above verification, it writes in +- * bin_args a binary representation of arguments usable by bstr_printf where +- * pointers from BPF have been sanitized. ++ * data->bin_args a binary representation of arguments usable by bstr_printf ++ * where pointers from BPF have been sanitized. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_bprintf_cleanup should be called to free them after use. 
+ */ + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +- u32 **bin_args, u32 num_args) ++ u32 num_args, struct bpf_bprintf_data *data) + { + char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; + size_t sizeof_cur_arg, sizeof_cur_ip; +@@ -774,12 +774,12 @@ int bpf_bprintf_prepare(char *fmt, u32 f + return -EINVAL; + fmt_size = fmt_end - fmt; + +- if (bin_args) { ++ if (data->get_bin_args) { + if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) + return -EBUSY; + + tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; +- *bin_args = (u32 *)tmp_buf; ++ data->bin_args = (u32 *)tmp_buf; + } + + for (i = 0; i < fmt_size; i++) { +@@ -980,24 +980,26 @@ out: + } + + BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, +- const void *, data, u32, data_len) ++ const void *, args, u32, data_len) + { ++ struct bpf_bprintf_data data = { ++ .get_bin_args = true, ++ }; + int err, num_args; +- u32 *bin_args; + + if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || +- (data_len && !data)) ++ (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ +- err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); ++ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); + if (err < 0) + return err; + +- err = bstr_printf(str, str_size, fmt, bin_args); ++ err = bstr_printf(str, str_size, fmt, data.bin_args); + + bpf_bprintf_cleanup(); + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -6407,6 +6407,7 @@ static int check_bpf_snprintf_call(struc + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; ++ struct bpf_bprintf_data data = {}; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; +@@ -6431,7 +6432,7 @@ static int check_bpf_snprintf_call(struc + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. 
+ */ +- err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); ++ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data); + if (err < 0) + verbose(env, "Invalid format string\n"); + +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -369,18 +369,20 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt + u64, arg2, u64, arg3) + { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; +- u32 *bin_args; ++ struct bpf_bprintf_data data = { ++ .get_bin_args = true, ++ }; + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + int ret; + +- ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, +- MAX_TRACE_PRINTK_VARARGS); ++ ret = bpf_bprintf_prepare(fmt, fmt_size, args, ++ MAX_TRACE_PRINTK_VARARGS, &data); + if (ret < 0) + return ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); +- ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); ++ ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); + + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); +@@ -415,21 +417,23 @@ const struct bpf_func_proto *bpf_get_tra + } + + BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, +- const void *, data, u32, data_len) ++ const void *, args, u32, data_len) + { ++ struct bpf_bprintf_data data = { ++ .get_bin_args = true, ++ }; + int err, num_args; +- u32 *bin_args; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || +- (data_len && !data)) ++ (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + +- err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); ++ err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); + if (err < 0) + return err; + +- seq_bprintf(m, fmt, bin_args); ++ seq_bprintf(m, fmt, data.bin_args); + + bpf_bprintf_cleanup(); + diff --git a/queue-5.15/bpf-do-cleanup-in-bpf_bprintf_cleanup-only-when-needed.patch b/queue-5.15/bpf-do-cleanup-in-bpf_bprintf_cleanup-only-when-needed.patch new file mode 100644 index 00000000000..760c37d8e26 --- /dev/null +++ b/queue-5.15/bpf-do-cleanup-in-bpf_bprintf_cleanup-only-when-needed.patch @@ -0,0 +1,115 @@ +From stable+bounces-20401-greg=kroah.com@vger.kernel.org Sat Feb 17 13:13:54 2024 +From: Thadeu Lima de Souza Cascardo +Date: Sat, 17 Feb 2024 09:13:20 -0300 +Subject: bpf: Do cleanup in bpf_bprintf_cleanup only when needed +To: stable@vger.kernel.org +Cc: cascardo@igalia.com, jolsa@kernel.org, daniel@iogearbox.net, yhs@fb.com +Message-ID: <20240217121321.2045993-7-cascardo@igalia.com> + +From: Jiri Olsa + +commit f19a4050455aad847fb93f18dc1fe502eb60f989 upstream. + +Currently we always cleanup/decrement bpf_bprintf_nest_level variable +in bpf_bprintf_cleanup if it's > 0. + +There's possible scenario where this could cause a problem, when +bpf_bprintf_prepare does not get bin_args buffer (because num_args is 0) +and following bpf_bprintf_cleanup call decrements bpf_bprintf_nest_level +variable, like: + + in task context: + bpf_bprintf_prepare(num_args != 0) increments 'bpf_bprintf_nest_level = 1' + -> first irq : + bpf_bprintf_prepare(num_args == 0) + bpf_bprintf_cleanup decrements 'bpf_bprintf_nest_level = 0' + -> second irq: + bpf_bprintf_prepare(num_args != 0) bpf_bprintf_nest_level = 1 + gets same buffer as task context above + +Adding check to bpf_bprintf_cleanup and doing the real cleanup only if we +got bin_args data in the first place. 
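In practice, the contract after this change is: every caller hands the same
struct bpf_bprintf_data to both bpf_bprintf_prepare() and
bpf_bprintf_cleanup(), and cleanup only touches the per-CPU nest level when
that particular prepare call actually took a buffer. A condensed caller-side
sketch (illustrative only, mirroring the bpf_snprintf() pattern from this
series):

    struct bpf_bprintf_data data = {
            .get_bin_args = true,   /* this caller wants bin_args */
    };
    int err;

    err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
    if (err < 0)
            return err;             /* prepare already cleaned up */

    err = bstr_printf(str, str_size, fmt, data.bin_args);

    /* Safe even when prepare took no buffer (num_args == 0):
     * cleanup now checks data->bin_args before decrementing the
     * per-CPU bpf_bprintf_nest_level. */
    bpf_bprintf_cleanup(&data);
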
+ +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Acked-by: Yonghong Song +Link: https://lore.kernel.org/bpf/20221215214430.1336195-3-jolsa@kernel.org +[cascardo: there is no bpf_trace_vprintk in 5.15] +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf.h | 2 +- + kernel/bpf/helpers.c | 16 +++++++++------- + kernel/trace/bpf_trace.c | 4 ++-- + 3 files changed, 12 insertions(+), 10 deletions(-) + +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2299,6 +2299,6 @@ struct bpf_bprintf_data { + + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 num_args, struct bpf_bprintf_data *data); +-void bpf_bprintf_cleanup(void); ++void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); + + #endif /* _LINUX_BPF_H */ +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -738,12 +738,14 @@ static int try_get_fmt_tmp_buf(char **tm + return 0; + } + +-void bpf_bprintf_cleanup(void) ++void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) + { +- if (this_cpu_read(bpf_bprintf_nest_level)) { +- this_cpu_dec(bpf_bprintf_nest_level); +- preempt_enable(); +- } ++ if (!data->bin_args) ++ return; ++ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) ++ return; ++ this_cpu_dec(bpf_bprintf_nest_level); ++ preempt_enable(); + } + + /* +@@ -975,7 +977,7 @@ nocopy_fmt: + err = 0; + out: + if (err) +- bpf_bprintf_cleanup(); ++ bpf_bprintf_cleanup(data); + return err; + } + +@@ -1001,7 +1003,7 @@ BPF_CALL_5(bpf_snprintf, char *, str, u3 + + err = bstr_printf(str, str_size, fmt, data.bin_args); + +- bpf_bprintf_cleanup(); ++ bpf_bprintf_cleanup(&data); + + return err + 1; + } +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -387,7 +387,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + +- bpf_bprintf_cleanup(); ++ bpf_bprintf_cleanup(&data); + + return ret; + } +@@ -435,7 +435,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_fi + + seq_bprintf(m, fmt, data.bin_args); + +- bpf_bprintf_cleanup(); ++ bpf_bprintf_cleanup(&data); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; + } diff --git a/queue-5.15/bpf-merge-printk-and-seq_printf-vararg-max-macros.patch b/queue-5.15/bpf-merge-printk-and-seq_printf-vararg-max-macros.patch new file mode 100644 index 00000000000..156025eddbf --- /dev/null +++ b/queue-5.15/bpf-merge-printk-and-seq_printf-vararg-max-macros.patch @@ -0,0 +1,83 @@ +From stable+bounces-20399-greg=kroah.com@vger.kernel.org Sat Feb 17 13:13:52 2024 +From: Thadeu Lima de Souza Cascardo +Date: Sat, 17 Feb 2024 09:13:18 -0300 +Subject: bpf: Merge printk and seq_printf VARARG max macros +To: stable@vger.kernel.org +Cc: cascardo@igalia.com, jolsa@kernel.org, daniel@iogearbox.net, yhs@fb.com +Message-ID: <20240217121321.2045993-5-cascardo@igalia.com> + +From: Dave Marchevsky + +commit 335ff4990cf3bfa42d8846f9b3d8c09456f51801 upstream. + +MAX_SNPRINTF_VARARGS and MAX_SEQ_PRINTF_VARARGS are used by bpf helpers +bpf_snprintf and bpf_seq_printf to limit their varargs. Both call into +bpf_bprintf_prepare for print formatting logic and have convenience +macros in libbpf (BPF_SNPRINTF, BPF_SEQ_PRINTF) which use the same +helper macros to convert varargs to a byte array. + +Changing shared functionality to support more varargs for either bpf +helper would affect the other as well, so let's combine the _VARARGS +macros to make this more obvious. 
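On the BPF program side, the shared limit is what both convenience macros are
checked against: libbpf packs every vararg into a u64 array and passes
data_len = nr_args * 8, which the helpers reject beyond
MAX_BPRINTF_VARARGS * 8 = 96 bytes. A minimal iterator sketch (illustrative,
assuming a libbpf build with vmlinux.h; not part of the patch):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>    /* BPF_SEQ_PRINTF */

    char LICENSE[] SEC("license") = "GPL";

    SEC("iter/task")
    int dump_task(struct bpf_iter__task *ctx)
    {
            struct task_struct *task = ctx->task;

            if (!task)
                    return 0;

            /* The macro packs each argument as a u64; with the merged
             * macro, bpf_seq_printf and bpf_snprintf now share the
             * same 12-argument ceiling. */
            BPF_SEQ_PRINTF(ctx->meta->seq, "pid=%d tgid=%d\n",
                           task->pid, task->tgid);
            return 0;
    }
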
+ +Signed-off-by: Dave Marchevsky +Signed-off-by: Alexei Starovoitov +Acked-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20210917182911.2426606-2-davemarchevsky@fb.com +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf.h | 2 ++ + kernel/bpf/helpers.c | 4 +--- + kernel/trace/bpf_trace.c | 4 +--- + 3 files changed, 4 insertions(+), 6 deletions(-) + +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2290,6 +2290,8 @@ void bpf_arch_poke_desc_update(struct bp + struct btf_id_set; + bool btf_id_set_contains(const struct btf_id_set *set, u32 id); + ++#define MAX_BPRINTF_VARARGS 12 ++ + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_buf, u32 num_args); + void bpf_bprintf_cleanup(void); +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -979,15 +979,13 @@ out: + return err; + } + +-#define MAX_SNPRINTF_VARARGS 12 +- + BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) + { + int err, num_args; + u32 *bin_args; + +- if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || ++ if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -414,15 +414,13 @@ const struct bpf_func_proto *bpf_get_tra + return &bpf_trace_printk_proto; + } + +-#define MAX_SEQ_PRINTF_VARARGS 12 +- + BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) + { + int err, num_args; + u32 *bin_args; + +- if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || ++ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; diff --git a/queue-5.15/bpf-remove-trace_printk_lock.patch b/queue-5.15/bpf-remove-trace_printk_lock.patch new file mode 100644 index 00000000000..747b61f358c --- /dev/null +++ b/queue-5.15/bpf-remove-trace_printk_lock.patch @@ -0,0 +1,184 @@ +From stable+bounces-20403-greg=kroah.com@vger.kernel.org Sat Feb 17 13:13:55 2024 +From: Thadeu Lima de Souza Cascardo +Date: Sat, 17 Feb 2024 09:13:21 -0300 +Subject: bpf: Remove trace_printk_lock +To: stable@vger.kernel.org +Cc: cascardo@igalia.com, jolsa@kernel.org, daniel@iogearbox.net, yhs@fb.com +Message-ID: <20240217121321.2045993-8-cascardo@igalia.com> + +From: Jiri Olsa + +commit e2bb9e01d589f7fa82573aedd2765ff9b277816a upstream. + +Both bpf_trace_printk and bpf_trace_vprintk helpers use static buffer guarded +with trace_printk_lock spin lock. + +The spin lock contention causes issues with bpf programs attached to +contention_begin tracepoint [1][2]. + +Andrii suggested we could get rid of the contention by using trylock, but we +could actually get rid of the spinlock completely by using percpu buffers the +same way as for bin_args in bpf_bprintf_prepare function. + +Adding new return 'buf' argument to struct bpf_bprintf_data and making +bpf_bprintf_prepare to return also the buffer for printk helpers. 
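The buffer scheme this converts to, condensed (an illustrative sketch with
hypothetical names, not the patch itself; the real implementation is
try_get_buffers() in the diff below): each CPU keeps a small stack of buffers
indexed by a per-CPU nesting counter, so task, irq and nested-irq users on
the same CPU each get a distinct buffer without any lock:

    struct bufs {
            char bin_args[512];
            char buf[1024];
    };
    static DEFINE_PER_CPU(struct bufs[3], pcpu_bufs);
    static DEFINE_PER_CPU(int, pcpu_nest);

    static int try_get(struct bufs **b)
    {
            int level;

            preempt_disable();
            level = this_cpu_inc_return(pcpu_nest);
            if (level > 3) {        /* more than 3 nested users */
                    this_cpu_dec(pcpu_nest);
                    preempt_enable();
                    return -EBUSY;
            }
            *b = this_cpu_ptr(&pcpu_bufs[level - 1]);
            return 0;               /* release: dec + preempt_enable */
    }
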
+ + [1] https://lore.kernel.org/bpf/CACkBjsakT_yWxnSWr4r-0TpPvbKm9-OBmVUhJb7hV3hY8fdCkw@mail.gmail.com/ + [2] https://lore.kernel.org/bpf/CACkBjsaCsTovQHFfkqJKto6S4Z8d02ud1D7MPESrHa1cVNNTrw@mail.gmail.com/ + +Reported-by: Hao Sun +Suggested-by: Andrii Nakryiko +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Acked-by: Yonghong Song +Link: https://lore.kernel.org/bpf/20221215214430.1336195-4-jolsa@kernel.org +[cascardo: there is no bpf_trace_vprintk in 5.15] +Signed-off-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf.h | 3 +++ + kernel/bpf/helpers.c | 31 +++++++++++++++++++------------ + kernel/trace/bpf_trace.c | 11 +++-------- + 3 files changed, 25 insertions(+), 20 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index d18f717b6e7e..ef8fc639a575 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2287,10 +2287,13 @@ struct btf_id_set; + bool btf_id_set_contains(const struct btf_id_set *set, u32 id); + + #define MAX_BPRINTF_VARARGS 12 ++#define MAX_BPRINTF_BUF 1024 + + struct bpf_bprintf_data { + u32 *bin_args; ++ char *buf; + bool get_bin_args; ++ bool get_buf; + }; + + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c +index cfa23c7112ed..c8827d1ff3c5 100644 +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -710,19 +710,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. + */ +-#define MAX_BPRINTF_BUF_LEN 512 ++#define MAX_BPRINTF_BIN_ARGS 512 + + /* Support executing three nested bprintf helper calls on a given CPU */ + #define MAX_BPRINTF_NEST_LEVEL 3 + struct bpf_bprintf_buffers { +- char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; ++ char bin_args[MAX_BPRINTF_BIN_ARGS]; ++ char buf[MAX_BPRINTF_BUF]; + }; +-static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); ++ ++static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); + static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); + +-static int try_get_fmt_tmp_buf(char **tmp_buf) ++static int try_get_buffers(struct bpf_bprintf_buffers **bufs) + { +- struct bpf_bprintf_buffers *bufs; + int nest_level; + + preempt_disable(); +@@ -732,15 +733,14 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) + preempt_enable(); + return -EBUSY; + } +- bufs = this_cpu_ptr(&bpf_bprintf_bufs); +- *tmp_buf = bufs->tmp_bufs[nest_level - 1]; ++ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); + + return 0; + } + + void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) + { +- if (!data->bin_args) ++ if (!data->bin_args && !data->buf) + return; + if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) + return; +@@ -765,7 +765,9 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 num_args, struct bpf_bprintf_data *data) + { ++ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; + char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; ++ struct bpf_bprintf_buffers *buffers = NULL; + size_t sizeof_cur_arg, sizeof_cur_ip; + int err, i, num_spec = 0; + u64 cur_arg; +@@ -776,14 +778,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + return -EINVAL; + fmt_size = fmt_end - fmt; + +- if (data->get_bin_args) { +- if (num_args && 
try_get_fmt_tmp_buf(&tmp_buf)) +- return -EBUSY; ++ if (get_buffers && try_get_buffers(&buffers)) ++ return -EBUSY; + +- tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; ++ if (data->get_bin_args) { ++ if (num_args) ++ tmp_buf = buffers->bin_args; ++ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; + data->bin_args = (u32 *)tmp_buf; + } + ++ if (data->get_buf) ++ data->buf = buffers->buf; ++ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; +diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c +index 2bc85a87f020..a1dc0ff1962e 100644 +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -360,8 +360,6 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) + return &bpf_probe_write_user_proto; + } + +-static DEFINE_RAW_SPINLOCK(trace_printk_lock); +- + #define MAX_TRACE_PRINTK_VARARGS 3 + #define BPF_TRACE_PRINTK_SIZE 1024 + +@@ -371,9 +369,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + struct bpf_bprintf_data data = { + .get_bin_args = true, ++ .get_buf = true, + }; +- static char buf[BPF_TRACE_PRINTK_SIZE]; +- unsigned long flags; + int ret; + + ret = bpf_bprintf_prepare(fmt, fmt_size, args, +@@ -381,11 +378,9 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + if (ret < 0) + return ret; + +- raw_spin_lock_irqsave(&trace_printk_lock, flags); +- ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); ++ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); + +- trace_bpf_trace_printk(buf); +- raw_spin_unlock_irqrestore(&trace_printk_lock, flags); ++ trace_bpf_trace_printk(data.buf); + + bpf_bprintf_cleanup(&data); + +-- +2.34.1 + diff --git a/queue-5.15/series b/queue-5.15/series index 114db49cc82..37038ce3470 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -6,3 +6,9 @@ smb-client-fix-potential-oobs-in-smb2_parse_contexts.patch smb-client-fix-parsing-of-smb3.1.1-posix-create-context.patch sched-rt-sysctl_sched_rr_timeslice-show-default-timeslice-after-reset.patch pci-dwc-fix-a-64bit-bug-in-dw_pcie_ep_raise_msix_irq.patch +bpf-merge-printk-and-seq_printf-vararg-max-macros.patch +bpf-add-struct-for-bin_args-arg-in-bpf_bprintf_prepare.patch +bpf-do-cleanup-in-bpf_bprintf_cleanup-only-when-needed.patch +bpf-remove-trace_printk_lock.patch +userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch +zonefs-improve-error-handling.patch diff --git a/queue-5.15/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch b/queue-5.15/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch new file mode 100644 index 00000000000..f5a3cea50ed --- /dev/null +++ b/queue-5.15/userfaultfd-fix-mmap_changing-checking-in-mfill_atomic_hugetlb.patch @@ -0,0 +1,80 @@ +From 67695f18d55924b2013534ef3bdc363bc9e14605 Mon Sep 17 00:00:00 2001 +From: Lokesh Gidra +Date: Wed, 17 Jan 2024 14:37:29 -0800 +Subject: userfaultfd: fix mmap_changing checking in mfill_atomic_hugetlb + +From: Lokesh Gidra + +commit 67695f18d55924b2013534ef3bdc363bc9e14605 upstream. + +In mfill_atomic_hugetlb(), mmap_changing isn't being checked +again if we drop mmap_lock and reacquire it. When the lock is not held, +mmap_changing could have been incremented. This is also inconsistent +with the behavior in mfill_atomic(). 
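The shape of the fix is the classic drop-and-revalidate pattern: any state
observed under mmap_lock must be rechecked after the lock is dropped and
reacquired. A hypothetical helper sketch (names are illustrative, not from
the patch):

    static int relock_and_revalidate(struct mm_struct *dst_mm,
                                     atomic_t *mmap_changing)
    {
            mmap_read_lock(dst_mm);
            /* A non-cooperative event (e.g. mremap) may have bumped
             * mmap_changing while the lock was dropped; ask the
             * caller to retry later. */
            if (mmap_changing && atomic_read(mmap_changing)) {
                    mmap_read_unlock(dst_mm);
                    return -EAGAIN;
            }
            return 0;       /* caller continues with the lock held */
    }
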
+ +Link: https://lkml.kernel.org/r/20240117223729.1444522-1-lokeshgidra@google.com +Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") +Signed-off-by: Lokesh Gidra +Cc: Andrea Arcangeli +Cc: Mike Rapoport +Cc: Axel Rasmussen +Cc: Brian Geffon +Cc: David Hildenbrand +Cc: Jann Horn +Cc: Kalesh Singh +Cc: Matthew Wilcox (Oracle) +Cc: Nicolas Geoffray +Cc: Peter Xu +Cc: Suren Baghdasaryan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Mike Rapoport (IBM) +Signed-off-by: Greg Kroah-Hartman +--- + mm/userfaultfd.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -289,6 +289,7 @@ static __always_inline ssize_t __mcopy_a + unsigned long dst_start, + unsigned long src_start, + unsigned long len, ++ atomic_t *mmap_changing, + enum mcopy_atomic_mode mode) + { + int vm_shared = dst_vma->vm_flags & VM_SHARED; +@@ -405,6 +406,15 @@ retry: + goto out; + } + mmap_read_lock(dst_mm); ++ /* ++ * If memory mappings are changing because of non-cooperative ++ * operation (e.g. mremap) running in parallel, bail out and ++ * request the user to retry later ++ */ ++ if (mmap_changing && atomic_read(mmap_changing)) { ++ err = -EAGAIN; ++ break; ++ } + + dst_vma = NULL; + goto retry; +@@ -440,6 +450,7 @@ extern ssize_t __mcopy_atomic_hugetlb(st + unsigned long dst_start, + unsigned long src_start, + unsigned long len, ++ atomic_t *mmap_changing, + enum mcopy_atomic_mode mode); + #endif /* CONFIG_HUGETLB_PAGE */ + +@@ -561,7 +572,8 @@ retry: + */ + if (is_vm_hugetlb_page(dst_vma)) + return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, +- src_start, len, mcopy_mode); ++ src_start, len, mmap_changing, ++ mcopy_mode); + + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) + goto out_unlock; diff --git a/queue-5.15/zonefs-improve-error-handling.patch b/queue-5.15/zonefs-improve-error-handling.patch new file mode 100644 index 00000000000..bfb6c32f4df --- /dev/null +++ b/queue-5.15/zonefs-improve-error-handling.patch @@ -0,0 +1,188 @@ +From 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Thu, 8 Feb 2024 17:26:59 +0900 +Subject: zonefs: Improve error handling + +From: Damien Le Moal + +commit 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 upstream. + +Write error handling is racy and can sometime lead to the error recovery +path wrongly changing the inode size of a sequential zone file to an +incorrect value which results in garbage data being readable at the end +of a file. There are 2 problems: + +1) zonefs_file_dio_write() updates a zone file write pointer offset + after issuing a direct IO with iomap_dio_rw(). This update is done + only if the IO succeed for synchronous direct writes. However, for + asynchronous direct writes, the update is done without waiting for + the IO completion so that the next asynchronous IO can be + immediately issued. However, if an asynchronous IO completes with a + failure right before the i_truncate_mutex lock protecting the update, + the update may change the value of the inode write pointer offset + that was corrected by the error path (zonefs_io_error() function). + +2) zonefs_io_error() is called when a read or write error occurs. This + function executes a report zone operation using the callback function + zonefs_io_error_cb(), which does all the error recovery handling + based on the current zone condition, write pointer position and + according to the mount options being used. 
However, depending on the + zoned device being used, a report zone callback may be executed in a + context that is different from the context of __zonefs_io_error(). As + a result, zonefs_io_error_cb() may be executed without the inode + truncate mutex lock held, which can lead to invalid error processing. + +Fix both problems as follows: +- Problem 1: Perform the inode write pointer offset update before a + direct write is issued with iomap_dio_rw(). This is safe to do as + partial direct writes are not supported (IOMAP_DIO_PARTIAL is not + set) and any failed IO will trigger the execution of zonefs_io_error() + which will correct the inode write pointer offset to reflect the + current state of the one on the device. +- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error() + and call this function directly from __zonefs_io_error() after + obtaining the zone information using blkdev_report_zones() with a + simple callback function that copies to a local stack variable the + struct blk_zone obtained from the device. This ensures that error + handling is performed holding the inode truncate mutex. + This change also simplifies error handling for conventional zone files + by bypassing the execution of report zones entirely. This is safe to + do because the condition of conventional zones cannot be read-only or + offline and conventional zone files are always fully mapped with a + constant file size. + +Reported-by: Shin'ichiro Kawasaki +Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Tested-by: Shin'ichiro Kawasaki +Reviewed-by: Johannes Thumshirn +Reviewed-by: Himanshu Madhani +Signed-off-by: Greg Kroah-Hartman +--- + fs/zonefs/super.c | 70 ++++++++++++++++++++++++++++++------------------------ + 1 file changed, 40 insertions(+), 30 deletions(-) + +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -327,16 +327,18 @@ static loff_t zonefs_check_zone_conditio + } + } + +-struct zonefs_ioerr_data { +- struct inode *inode; +- bool write; +-}; +- + static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + void *data) + { +- struct zonefs_ioerr_data *err = data; +- struct inode *inode = err->inode; ++ struct blk_zone *z = data; ++ ++ *z = *zone; ++ return 0; ++} ++ ++static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, ++ bool write) ++{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +@@ -352,8 +354,8 @@ static int zonefs_io_error_cb(struct blk + isize = i_size_read(inode); + if (zone->cond != BLK_ZONE_COND_OFFLINE && + zone->cond != BLK_ZONE_COND_READONLY && +- !err->write && isize == data_size) +- return 0; ++ !write && isize == data_size) ++ return; + + /* + * At this point, we detected either a bad zone or an inconsistency +@@ -374,8 +376,9 @@ static int zonefs_io_error_cb(struct blk + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. 
+ */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) +- zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", ++ if (isize != data_size) ++ zonefs_warn(sb, ++ "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + + /* +@@ -435,8 +438,6 @@ static int zonefs_io_error_cb(struct blk + zonefs_update_stats(inode, data_size); + zonefs_i_size_write(inode, data_size); + zi->i_wpoffset = data_size; +- +- return 0; + } + + /* +@@ -450,23 +451,25 @@ static void __zonefs_io_error(struct ino + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; +- struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + unsigned int noio_flag; +- unsigned int nr_zones = 1; +- struct zonefs_ioerr_data err = { +- .inode = inode, +- .write = write, +- }; ++ struct blk_zone zone; + int ret; + + /* +- * The only files that have more than one zone are conventional zone +- * files with aggregated conventional zones, for which the inode zone +- * size is always larger than the device zone size. +- */ +- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) +- nr_zones = zi->i_zone_size >> +- (sbi->s_zone_sectors_shift + SECTOR_SHIFT); ++ * Conventional zone have no write pointer and cannot become read-only ++ * or offline. So simply fake a report for a single or aggregated zone ++ * and let zonefs_handle_io_error() correct the zone inode information ++ * according to the mount options. ++ */ ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) { ++ zone.start = zi->i_zsector; ++ zone.len = zi->i_max_size >> SECTOR_SHIFT; ++ zone.wp = zone.start + zone.len; ++ zone.type = BLK_ZONE_TYPE_CONVENTIONAL; ++ zone.cond = BLK_ZONE_COND_NOT_WP; ++ zone.capacity = zone.len; ++ goto handle_io_error; ++ } + + /* + * Memory allocations in blkdev_report_zones() can trigger a memory +@@ -477,12 +480,19 @@ static void __zonefs_io_error(struct ino + * the GFP_NOIO context avoids both problems. + */ + noio_flag = memalloc_noio_save(); +- ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, +- zonefs_io_error_cb, &err); +- if (ret != nr_zones) ++ ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, 1, ++ zonefs_io_error_cb, &zone); ++ memalloc_noio_restore(noio_flag); ++ if (ret != 1) { + zonefs_err(sb, "Get inode %lu zone information failed %d\n", + inode->i_ino, ret); +- memalloc_noio_restore(noio_flag); ++ zonefs_warn(sb, "remounting filesystem read-only\n"); ++ sb->s_flags |= SB_RDONLY; ++ return; ++ } ++ ++handle_io_error: ++ zonefs_handle_io_error(inode, &zone, write); + } + + static void zonefs_io_error(struct inode *inode, bool write)
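The conventional-zone shortcut in __zonefs_io_error() above works because
such zones have no write pointer and can never become read-only or offline,
so a synthetic struct blk_zone is as good as a real report. Condensed
(illustrative only, mirroring the hunk above):

    struct blk_zone zone = {
            .start = zi->i_zsector,
            .len   = zi->i_max_size >> SECTOR_SHIFT,
            .type  = BLK_ZONE_TYPE_CONVENTIONAL,
            .cond  = BLK_ZONE_COND_NOT_WP,
    };
    zone.wp = zone.start + zone.len;        /* full, by definition */
    zone.capacity = zone.len;
    zonefs_handle_io_error(inode, &zone, write);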