From: Greg Kroah-Hartman Date: Thu, 23 Jun 2022 16:00:18 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.9.320~22 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=95abe8300ea1f17f22f40a210976e141ea08c01e;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: fuse-fix-pipe-buffer-lifetime-for-direct_io.patch reinstate-some-of-swiotlb-rework-fix-info-leak-with-dma_from_device.patch s390-mm-use-non-quiescing-sske-for-kvm-switch-to-keyed-guest.patch secure_seq-use-the-64-bits-of-the-siphash-for-port-offset-calculation.patch swiotlb-fix-info-leak-with-dma_from_device.patch tcp-add-small-random-increments-to-the-source-port.patch tcp-add-some-entropy-in-__inet_hash_connect.patch tcp-change-source-port-randomizarion-at-connect-time.patch tcp-drop-the-hash_32-part-from-the-index-calculation.patch tcp-dynamically-allocate-the-perturb-table-used-by-source-ports.patch tcp-increase-source-port-perturb-table-to-2-16.patch tcp-use-different-parts-of-the-port_offset-for-index-and-offset.patch xprtrdma-fix-incorrect-header-size-calculations.patch --- diff --git a/queue-4.9/fuse-fix-pipe-buffer-lifetime-for-direct_io.patch b/queue-4.9/fuse-fix-pipe-buffer-lifetime-for-direct_io.patch new file mode 100644 index 00000000000..2cc5ded659d --- /dev/null +++ b/queue-4.9/fuse-fix-pipe-buffer-lifetime-for-direct_io.patch @@ -0,0 +1,83 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Miklos Szeredi +Date: Mon, 7 Mar 2022 16:30:44 +0100 +Subject: fuse: fix pipe buffer lifetime for direct_io + +From: Miklos Szeredi + +commit 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 upstream. + +In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls +fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then +imports the write buffer with fuse_get_user_pages(), which uses +iov_iter_get_pages() to grab references to userspace pages instead of +actually copying memory. + +On the filesystem device side, these pages can then either be read to +userspace (via fuse_dev_read()), or splice()d over into a pipe using +fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops. + +This is wrong because after fuse_dev_do_read() unlocks the FUSE request, +the userspace filesystem can mark the request as completed, causing write() +to return. At that point, the userspace filesystem should no longer have +access to the pipe buffer. + +Fix by copying pages coming from the user address space to new pipe +buffers. + +Reported-by: Jann Horn +Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device") +Cc: +Signed-off-by: Miklos Szeredi +Signed-off-by: Zach O'Keefe +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/dev.c | 12 +++++++++++- + fs/fuse/file.c | 1 + + fs/fuse/fuse_i.h | 2 ++ + 3 files changed, 14 insertions(+), 1 deletion(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -992,7 +992,17 @@ static int fuse_copy_page(struct fuse_co + + while (count) { + if (cs->write && cs->pipebufs && page) { +- return fuse_ref_page(cs, page, offset, count); ++ /* ++ * Can't control lifetime of pipe buffers, so always ++ * copy user pages. 
++ */ ++ if (cs->req->user_pages) { ++ err = fuse_copy_fill(cs); ++ if (err) ++ return err; ++ } else { ++ return fuse_ref_page(cs, page, offset, count); ++ } + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -1319,6 +1319,7 @@ static int fuse_get_user_pages(struct fu + (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + } + ++ req->user_pages = true; + if (write) + req->in.argpages = 1; + else +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -310,6 +310,8 @@ struct fuse_req { + /** refcount */ + atomic_t count; + ++ bool user_pages; ++ + /** Unique ID for the interrupt request */ + u64 intr_unique; + diff --git a/queue-4.9/reinstate-some-of-swiotlb-rework-fix-info-leak-with-dma_from_device.patch b/queue-4.9/reinstate-some-of-swiotlb-rework-fix-info-leak-with-dma_from_device.patch new file mode 100644 index 00000000000..304e449d4ce --- /dev/null +++ b/queue-4.9/reinstate-some-of-swiotlb-rework-fix-info-leak-with-dma_from_device.patch @@ -0,0 +1,94 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Linus Torvalds +Date: Mon, 28 Mar 2022 11:37:05 -0700 +Subject: Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" + +From: Linus Torvalds + +commit 901c7280ca0d5e2b4a8929fbe0bfb007ac2a6544 upstream. + +Halil Pasic points out [1] that the full revert of that commit (revert +in bddac7c1e02b), and that a partial revert that only reverts the +problematic case, but still keeps some of the cleanups is probably +better.  + +And that partial revert [2] had already been verified by Oleksandr +Natalenko to also fix the issue, I had just missed that in the long +discussion. + +So let's reinstate the cleanups from commit aa6f8dcbab47 ("swiotlb: +rework "fix info leak with DMA_FROM_DEVICE""), and effectively only +revert the part that caused problems. + +Link: https://lore.kernel.org/all/20220328013731.017ae3e3.pasic@linux.ibm.com/ [1] +Link: https://lore.kernel.org/all/20220324055732.GB12078@lst.de/ [2] +Link: https://lore.kernel.org/all/4386660.LvFx2qVVIh@natalenko.name/ [3] +Suggested-by: Halil Pasic +Tested-by: Oleksandr Natalenko +Cc: Christoph Hellwig" +Signed-off-by: Linus Torvalds +[OP: backport to 4.14: apply swiotlb_tbl_map_single() changes in lib/swiotlb.c] +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/DMA-attributes.txt | 10 ---------- + include/linux/dma-mapping.h | 7 ------- + lib/swiotlb.c | 12 ++++++++---- + 3 files changed, 8 insertions(+), 21 deletions(-) + +--- a/Documentation/DMA-attributes.txt ++++ b/Documentation/DMA-attributes.txt +@@ -143,13 +143,3 @@ So, this provides a way for drivers to a + where allocation failures are not a problem, and shouldn't bother the logs. + + NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC. +- +-DMA_ATTR_PRIVILEGED +-------------------- +- +-Some advanced peripherals such as remote processors and GPUs perform +-accesses to DMA buffers in both privileged "supervisor" and unprivileged +-"user" modes. This attribute is used to indicate to the DMA-mapping +-subsystem that the buffer is fully accessible at the elevated privilege +-level (and ideally inaccessible or at least read-only at the +-lesser-privileged levels). +--- a/include/linux/dma-mapping.h ++++ b/include/linux/dma-mapping.h +@@ -61,13 +61,6 @@ + * allocation failure reports (similarly to __GFP_NOWARN). 
+ */ + #define DMA_ATTR_NO_WARN (1UL << 8) +-/* +- * This is a hint to the DMA-mapping subsystem that the device is expected +- * to overwrite the entire mapped size, thus the caller does not require any +- * of the previous buffer contents to be preserved. This allows +- * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. +- */ +-#define DMA_ATTR_OVERWRITE (1UL << 10) + + /* + * A dma_addr_t can hold any valid DMA or bus address for the platform. +--- a/lib/swiotlb.c ++++ b/lib/swiotlb.c +@@ -532,10 +532,14 @@ found: + */ + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); +- if (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || +- dir == DMA_BIDIRECTIONAL) +- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); +- ++ /* ++ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig ++ * to the tlb buffer, if we knew for sure the device will ++ * overwirte the entire current content. But we don't. Thus ++ * unconditional bounce may prevent leaking swiotlb content (i.e. ++ * kernel memory) to user-space. ++ */ ++ swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + return tlb_addr; + } + EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); diff --git a/queue-4.9/s390-mm-use-non-quiescing-sske-for-kvm-switch-to-keyed-guest.patch b/queue-4.9/s390-mm-use-non-quiescing-sske-for-kvm-switch-to-keyed-guest.patch new file mode 100644 index 00000000000..f138e79fefb --- /dev/null +++ b/queue-4.9/s390-mm-use-non-quiescing-sske-for-kvm-switch-to-keyed-guest.patch @@ -0,0 +1,35 @@ +From 3ae11dbcfac906a8c3a480e98660a823130dc16a Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Mon, 30 May 2022 11:27:06 +0200 +Subject: s390/mm: use non-quiescing sske for KVM switch to keyed guest + +From: Christian Borntraeger + +commit 3ae11dbcfac906a8c3a480e98660a823130dc16a upstream. + +The switch to a keyed guest does not require a classic sske as the other +guest CPUs are not accessing the key before the switch is complete. +By using the NQ SSKE things are faster especially with multiple guests. + +Signed-off-by: Christian Borntraeger +Suggested-by: Janis Schoetterl-Glausch +Reviewed-by: Claudio Imbrenda +Link: https://lore.kernel.org/r/20220530092706.11637-3-borntraeger@linux.ibm.com +Signed-off-by: Christian Borntraeger +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/mm/pgtable.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/s390/mm/pgtable.c ++++ b/arch/s390/mm/pgtable.c +@@ -595,7 +595,7 @@ void ptep_zap_key(struct mm_struct *mm, + PGSTE_GR_BIT | PGSTE_GC_BIT); + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) +- page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); ++ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); + pgste_set_unlock(ptep, pgste); + preempt_enable(); + } diff --git a/queue-4.9/secure_seq-use-the-64-bits-of-the-siphash-for-port-offset-calculation.patch b/queue-4.9/secure_seq-use-the-64-bits-of-the-siphash-for-port-offset-calculation.patch new file mode 100644 index 00000000000..8b505b23737 --- /dev/null +++ b/queue-4.9/secure_seq-use-the-64-bits-of-the-siphash-for-port-offset-calculation.patch @@ -0,0 +1,138 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:08 +0200 +Subject: secure_seq: use the 64 bits of the siphash for port offset calculation + +From: Willy Tarreau + +commit b2d057560b8107c633b39aabe517ff9d93f285e3 upstream. 
+ +SipHash replaced MD5 in secure_ipv{4,6}_port_ephemeral() via commit +7cd23e5300c1 ("secure_seq: use SipHash in place of MD5"), but the output +remained truncated to 32-bit only. In order to exploit more bits from the +hash, let's make the functions return the full 64-bit of siphash_3u32(). +We also make sure the port offset calculation in __inet_hash_connect() +remains done on 32-bit to avoid the need for div_u64_rem() and an extra +cost on 32-bit systems. + +Cc: Jason A. Donenfeld +Cc: Moshe Kol +Cc: Yossi Gilad +Cc: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/net/inet_hashtables.h | 2 +- + include/net/secure_seq.h | 4 ++-- + net/core/secure_seq.c | 4 ++-- + net/ipv4/inet_hashtables.c | 10 ++++++---- + net/ipv6/inet6_hashtables.c | 4 ++-- + 5 files changed, 13 insertions(+), 11 deletions(-) + +--- a/include/net/inet_hashtables.h ++++ b/include/net/inet_hashtables.h +@@ -382,7 +382,7 @@ static inline void sk_rcv_saddr_set(stru + } + + int __inet_hash_connect(struct inet_timewait_death_row *death_row, +- struct sock *sk, u32 port_offset, ++ struct sock *sk, u64 port_offset, + int (*check_established)(struct inet_timewait_death_row *, + struct sock *, __u16, + struct inet_timewait_sock **)); +--- a/include/net/secure_seq.h ++++ b/include/net/secure_seq.h +@@ -3,8 +3,8 @@ + + #include + +-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); +-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, ++u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); ++u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport); + __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); +--- a/net/core/secure_seq.c ++++ b/net/core/secure_seq.c +@@ -62,7 +62,7 @@ __u32 secure_tcpv6_sequence_number(const + } + EXPORT_SYMBOL(secure_tcpv6_sequence_number); + +-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, ++u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) + { + u32 secret[MD5_MESSAGE_BYTES / 4]; +@@ -102,7 +102,7 @@ __u32 secure_tcp_sequence_number(__be32 + return seq_scale(hash[0]); + } + +-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) ++u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) + { + u32 hash[MD5_DIGEST_WORDS]; + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -382,7 +382,7 @@ not_unique: + return -EADDRNOTAVAIL; + } + +-static u32 inet_sk_port_offset(const struct sock *sk) ++static u64 inet_sk_port_offset(const struct sock *sk) + { + const struct inet_sock *inet = inet_sk(sk); + +@@ -549,7 +549,7 @@ EXPORT_SYMBOL_GPL(inet_unhash); + static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; + + int __inet_hash_connect(struct inet_timewait_death_row *death_row, +- struct sock *sk, u32 port_offset, ++ struct sock *sk, u64 port_offset, + int (*check_established)(struct inet_timewait_death_row *, + struct sock *, __u16, struct inet_timewait_sock **)) + { +@@ -589,7 +589,9 @@ int __inet_hash_connect(struct inet_time + net_get_random_once(table_perturb, sizeof(table_perturb)); + index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + +- offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining; ++ offset = READ_ONCE(table_perturb[index]) + port_offset; ++ offset %= remaining; ++ + /* In first pass we 
try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ +@@ -670,7 +672,7 @@ ok: + int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) + { +- u32 port_offset = 0; ++ u64 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); +--- a/net/ipv6/inet6_hashtables.c ++++ b/net/ipv6/inet6_hashtables.c +@@ -242,7 +242,7 @@ not_unique: + return -EADDRNOTAVAIL; + } + +-static u32 inet6_sk_port_offset(const struct sock *sk) ++static u64 inet6_sk_port_offset(const struct sock *sk) + { + const struct inet_sock *inet = inet_sk(sk); + +@@ -254,7 +254,7 @@ static u32 inet6_sk_port_offset(const st + int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) + { +- u32 port_offset = 0; ++ u64 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet6_sk_port_offset(sk); diff --git a/queue-4.9/series b/queue-4.9/series index 8215c5972d8..c5a5351e877 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -250,3 +250,16 @@ ext4-make-variable-count-signed.patch ext4-add-reserved-gdt-blocks-check.patch l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch +s390-mm-use-non-quiescing-sske-for-kvm-switch-to-keyed-guest.patch +xprtrdma-fix-incorrect-header-size-calculations.patch +swiotlb-fix-info-leak-with-dma_from_device.patch +reinstate-some-of-swiotlb-rework-fix-info-leak-with-dma_from_device.patch +fuse-fix-pipe-buffer-lifetime-for-direct_io.patch +tcp-change-source-port-randomizarion-at-connect-time.patch +tcp-add-some-entropy-in-__inet_hash_connect.patch +secure_seq-use-the-64-bits-of-the-siphash-for-port-offset-calculation.patch +tcp-use-different-parts-of-the-port_offset-for-index-and-offset.patch +tcp-add-small-random-increments-to-the-source-port.patch +tcp-dynamically-allocate-the-perturb-table-used-by-source-ports.patch +tcp-increase-source-port-perturb-table-to-2-16.patch +tcp-drop-the-hash_32-part-from-the-index-calculation.patch diff --git a/queue-4.9/swiotlb-fix-info-leak-with-dma_from_device.patch b/queue-4.9/swiotlb-fix-info-leak-with-dma_from_device.patch new file mode 100644 index 00000000000..2713c40f0c0 --- /dev/null +++ b/queue-4.9/swiotlb-fix-info-leak-with-dma_from_device.patch @@ -0,0 +1,108 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Halil Pasic +Date: Fri, 11 Feb 2022 02:12:52 +0100 +Subject: swiotlb: fix info leak with DMA_FROM_DEVICE + +From: Halil Pasic + +commit ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e upstream. + +The problem I'm addressing was discovered by the LTP test covering +cve-2018-1000204. + +A short description of what happens follows: +1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO + interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV + and a corresponding dxferp. The peculiar thing about this is that TUR + is not reading from the device. +2) In sg_start_req() the invocation of blk_rq_map_user() effectively + bounces the user-space buffer. As if the device was to transfer into + it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in + sg_build_indirect()") we make sure this first bounce buffer is + allocated with GFP_ZERO. +3) For the rest of the story we keep ignoring that we have a TUR, so the + device won't touch the buffer we prepare as if the we had a + DMA_FROM_DEVICE type of situation. 
My setup uses a virtio-scsi device + and the buffer allocated by SG is mapped by the function + virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here + scatter-gather and not scsi generics). This mapping involves bouncing + via the swiotlb (we need swiotlb to do virtio in protected guest like + s390 Secure Execution, or AMD SEV). +4) When the SCSI TUR is done, we first copy back the content of the second + (that is swiotlb) bounce buffer (which most likely contains some + previous IO data), to the first bounce buffer, which contains all + zeros. Then we copy back the content of the first bounce buffer to + the user-space buffer. +5) The test case detects that the buffer, which it zero-initialized, + ain't all zeros and fails. + +One can argue that this is an swiotlb problem, because without swiotlb +we leak all zeros, and the swiotlb should be transparent in a sense that +it does not affect the outcome (if all other participants are well +behaved). + +Copying the content of the original buffer into the swiotlb buffer is +the only way I can think of to make swiotlb transparent in such +scenarios. So let's do just that if in doubt, but allow the driver +to tell us that the whole mapped buffer is going to be overwritten, +in which case we can preserve the old behavior and avoid the performance +impact of the extra bounce. + +Signed-off-by: Halil Pasic +Signed-off-by: Christoph Hellwig +[OP: backport to 4.14: apply swiotlb_tbl_map_single() changes in lib/swiotlb.c] +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/DMA-attributes.txt | 10 ++++++++++ + include/linux/dma-mapping.h | 7 +++++++ + lib/swiotlb.c | 3 ++- + 3 files changed, 19 insertions(+), 1 deletion(-) + +--- a/Documentation/DMA-attributes.txt ++++ b/Documentation/DMA-attributes.txt +@@ -143,3 +143,13 @@ So, this provides a way for drivers to a + where allocation failures are not a problem, and shouldn't bother the logs. + + NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC. ++ ++DMA_ATTR_PRIVILEGED ++------------------- ++ ++Some advanced peripherals such as remote processors and GPUs perform ++accesses to DMA buffers in both privileged "supervisor" and unprivileged ++"user" modes. This attribute is used to indicate to the DMA-mapping ++subsystem that the buffer is fully accessible at the elevated privilege ++level (and ideally inaccessible or at least read-only at the ++lesser-privileged levels). +--- a/include/linux/dma-mapping.h ++++ b/include/linux/dma-mapping.h +@@ -61,6 +61,13 @@ + * allocation failure reports (similarly to __GFP_NOWARN). + */ + #define DMA_ATTR_NO_WARN (1UL << 8) ++/* ++ * This is a hint to the DMA-mapping subsystem that the device is expected ++ * to overwrite the entire mapped size, thus the caller does not require any ++ * of the previous buffer contents to be preserved. This allows ++ * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. ++ */ ++#define DMA_ATTR_OVERWRITE (1UL << 10) + + /* + * A dma_addr_t can hold any valid DMA or bus address for the platform. 
+--- a/lib/swiotlb.c ++++ b/lib/swiotlb.c +@@ -532,7 +532,8 @@ found: + */ + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); +- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) ++ if (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || ++ dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + + return tlb_addr; diff --git a/queue-4.9/tcp-add-small-random-increments-to-the-source-port.patch b/queue-4.9/tcp-add-small-random-increments-to-the-source-port.patch new file mode 100644 index 00000000000..56d0b9dac03 --- /dev/null +++ b/queue-4.9/tcp-add-small-random-increments-to-the-source-port.patch @@ -0,0 +1,53 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:11 +0200 +Subject: tcp: add small random increments to the source port + +From: Willy Tarreau + +commit ca7af0402550f9a0b3316d5f1c30904e42ed257d upstream. + +Here we're randomly adding between 0 and 7 random increments to the +selected source port in order to add some noise in the source port +selection that will make the next port less predictable. + +With the default port range of 32768-60999 this means a worst case +reuse scenario of 14116/8=1764 connections between two consecutive +uses of the same port, with an average of 14116/4.5=3137. This code +was stressed at more than 800000 connections per second to a fixed +target with all connections closed by the client using RSTs (worst +condition) and only 2 connections failed among 13 billion, despite +the hash being reseeded every 10 seconds, indicating a perfectly +safe situation. + +Cc: Moshe Kol +Cc: Yossi Gilad +Cc: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -644,11 +644,12 @@ next_port: + return -EADDRNOTAVAIL; + + ok: +- /* If our first attempt found a candidate, skip next candidate +- * in 1/16 of cases to add some noise. ++ /* Here we want to add a little bit of randomness to the next source ++ * port that will be chosen. We use a max() with a random here so that ++ * on low contention the randomness is maximal and on high contention ++ * it may be inexistent. + */ +- if (!i && !(prandom_u32() % 16)) +- i = 2; ++ i = max_t(int, i, (prandom_u32() & 7) * 2); + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + + /* Head lock still held and bh's disabled */ diff --git a/queue-4.9/tcp-add-some-entropy-in-__inet_hash_connect.patch b/queue-4.9/tcp-add-some-entropy-in-__inet_hash_connect.patch new file mode 100644 index 00000000000..2d254e36d0d --- /dev/null +++ b/queue-4.9/tcp-add-some-entropy-in-__inet_hash_connect.patch @@ -0,0 +1,49 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Eric Dumazet +Date: Tue, 9 Feb 2021 11:20:28 -0800 +Subject: tcp: add some entropy in __inet_hash_connect() + +From: Eric Dumazet + +commit c579bd1b4021c42ae247108f1e6f73dd3f08600c upstream. + +Even when implementing RFC 6056 3.3.4 (Algorithm 4: Double-Hash +Port Selection Algorithm), a patient attacker could still be able +to collect enough state from an otherwise idle host. + +Idea of this patch is to inject some noise, in the +cases __inet_hash_connect() found a candidate in the first +attempt. 
+ +This noise should not significantly reduce the collision +avoidance, and should be zero if connection table +is already well used. + +Note that this is not implementing RFC 6056 3.3.5 +because we think Algorithm 5 could hurt typical +workloads. + +Signed-off-by: Eric Dumazet +Cc: David Dworken +Cc: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -642,6 +642,11 @@ next_port: + return -EADDRNOTAVAIL; + + ok: ++ /* If our first attempt found a candidate, skip next candidate ++ * in 1/16 of cases to add some noise. ++ */ ++ if (!i && !(prandom_u32() % 16)) ++ i = 2; + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + + /* Head lock still held and bh's disabled */ diff --git a/queue-4.9/tcp-change-source-port-randomizarion-at-connect-time.patch b/queue-4.9/tcp-change-source-port-randomizarion-at-connect-time.patch new file mode 100644 index 00000000000..f974ec3b2c5 --- /dev/null +++ b/queue-4.9/tcp-change-source-port-randomizarion-at-connect-time.patch @@ -0,0 +1,98 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Eric Dumazet +Date: Tue, 9 Feb 2021 11:20:27 -0800 +Subject: tcp: change source port randomizarion at connect() time + +From: Eric Dumazet + +commit 190cc82489f46f9d88e73c81a47e14f80a791e1a upstream. + +RFC 6056 (Recommendations for Transport-Protocol Port Randomization) +provides good summary of why source selection needs extra care. + +David Dworken reminded us that linux implements Algorithm 3 +as described in RFC 6056 3.3.3 + +Quoting David : + In the context of the web, this creates an interesting info leak where + websites can count how many TCP connections a user's computer is + establishing over time. For example, this allows a website to count + exactly how many subresources a third party website loaded. + This also allows: + - Distinguishing between different users behind a VPN based on + distinct source port ranges. + - Tracking users over time across multiple networks. + - Covert communication channels between different browsers/browser + profiles running on the same computer + - Tracking what applications are running on a computer based on + the pattern of how fast source ports are getting incremented. + +Section 3.3.4 describes an enhancement, that reduces +attackers ability to use the basic information currently +stored into the shared 'u32 hint'. + +This change also decreases collision rate when +multiple applications need to connect() to +different destinations. + +Signed-off-by: Eric Dumazet +Reported-by: David Dworken +Cc: Willem de Bruijn +Signed-off-by: David S. Miller +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -537,6 +537,17 @@ void inet_unhash(struct sock *sk) + } + EXPORT_SYMBOL_GPL(inet_unhash); + ++/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm ++ * Note that we use 32bit integers (vs RFC 'short integers') ++ * because 2^16 is not a multiple of num_ephemeral and this ++ * property might be used by clever attacker. 
++ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, ++ * we use 256 instead to really give more isolation and ++ * privacy, this only consumes 1 KB of kernel memory. ++ */ ++#define INET_TABLE_PERTURB_SHIFT 8 ++static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; ++ + int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk, u32 port_offset, + int (*check_established)(struct inet_timewait_death_row *, +@@ -550,7 +561,7 @@ int __inet_hash_connect(struct inet_time + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; +- static u32 hint; ++ u32 index; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, +@@ -575,7 +586,10 @@ int __inet_hash_connect(struct inet_time + if (likely(remaining > 1)) + remaining &= ~1U; + +- offset = (hint + port_offset) % remaining; ++ net_get_random_once(table_perturb, sizeof(table_perturb)); ++ index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); ++ ++ offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ +@@ -628,7 +642,7 @@ next_port: + return -EADDRNOTAVAIL; + + ok: +- hint += i + 2; ++ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); diff --git a/queue-4.9/tcp-drop-the-hash_32-part-from-the-index-calculation.patch b/queue-4.9/tcp-drop-the-hash_32-part-from-the-index-calculation.patch new file mode 100644 index 00000000000..6adcf876a89 --- /dev/null +++ b/queue-4.9/tcp-drop-the-hash_32-part-from-the-index-calculation.patch @@ -0,0 +1,37 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:14 +0200 +Subject: tcp: drop the hash_32() part from the index calculation + +From: Willy Tarreau + +commit e8161345ddbb66e449abde10d2fdce93f867eba9 upstream. + +In commit 190cc82489f4 ("tcp: change source port randomizarion at +connect() time"), the table_perturb[] array was introduced and an +index was taken from the port_offset via hash_32(). But it turns +out that hash_32() performs a multiplication while the input here +comes from the output of SipHash in secure_seq, that is well +distributed enough to avoid the need for yet another hash. 
+ +Suggested-by: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -590,7 +590,7 @@ int __inet_hash_connect(struct inet_time + + net_get_random_once(table_perturb, + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); +- index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); ++ index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); + + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); + offset %= remaining; diff --git a/queue-4.9/tcp-dynamically-allocate-the-perturb-table-used-by-source-ports.patch b/queue-4.9/tcp-dynamically-allocate-the-perturb-table-used-by-source-ports.patch new file mode 100644 index 00000000000..22080f9ec74 --- /dev/null +++ b/queue-4.9/tcp-dynamically-allocate-the-perturb-table-used-by-source-ports.patch @@ -0,0 +1,68 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:12 +0200 +Subject: tcp: dynamically allocate the perturb table used by source ports + +From: Willy Tarreau + +commit e9261476184be1abd486c9434164b2acbe0ed6c2 upstream. + +We'll need to further increase the size of this table and it's likely +that at some point its size will not be suitable anymore for a static +table. Let's allocate it on boot from inet_hashinfo2_init(), which is +called from tcp_init(). + +Cc: Moshe Kol +Cc: Yossi Gilad +Cc: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +[bwh: Backported to 4.9: + - There is no inet_hashinfo2_init(), so allocate the table in + inet_hashinfo_init() when called by TCP + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -546,7 +546,8 @@ EXPORT_SYMBOL_GPL(inet_unhash); + * privacy, this only consumes 1 KB of kernel memory. 
+ */ + #define INET_TABLE_PERTURB_SHIFT 8 +-static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; ++#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) ++static u32 *table_perturb; + + int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk, u64 port_offset, +@@ -586,7 +587,8 @@ int __inet_hash_connect(struct inet_time + if (likely(remaining > 1)) + remaining &= ~1U; + +- net_get_random_once(table_perturb, sizeof(table_perturb)); ++ net_get_random_once(table_perturb, ++ INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); + index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); +@@ -691,6 +693,15 @@ void inet_hashinfo_init(struct inet_hash + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, + i + LISTENING_NULLS_BASE); + } ++ ++ if (h != &tcp_hashinfo) ++ return; ++ ++ /* this one is used for source ports of outgoing connections */ ++ table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, ++ sizeof(*table_perturb), GFP_KERNEL); ++ if (!table_perturb) ++ panic("TCP: failed to alloc table_perturb"); + } + EXPORT_SYMBOL_GPL(inet_hashinfo_init); + diff --git a/queue-4.9/tcp-increase-source-port-perturb-table-to-2-16.patch b/queue-4.9/tcp-increase-source-port-perturb-table-to-2-16.patch new file mode 100644 index 00000000000..52f9a455726 --- /dev/null +++ b/queue-4.9/tcp-increase-source-port-perturb-table-to-2-16.patch @@ -0,0 +1,60 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:13 +0200 +Subject: tcp: increase source port perturb table to 2^16 + +From: Willy Tarreau + +commit 4c2c8f03a5ab7cb04ec64724d7d176d00bcc91e5 upstream. + +Moshe Kol, Amit Klein, and Yossi Gilad reported being able to accurately +identify a client by forcing it to emit only 40 times more connections +than there are entries in the table_perturb[] table. The previous two +improvements consisting in resalting the secret every 10s and adding +randomness to each port selection only slightly improved the situation, +and the current value of 2^8 was too small as it's not very difficult +to make a client emit 10k connections in less than 10 seconds. + +Thus we're increasing the perturb table from 2^8 to 2^16 so that the +same precision now requires 2.6M connections, which is more difficult in +this time frame and harder to hide as a background activity. The impact +is that the table now uses 256 kB instead of 1 kB, which could mostly +affect devices making frequent outgoing connections. However such +components usually target a small set of destinations (load balancers, +database clients, perf assessment tools), and in practice only a few +entries will be visited, like before. + +A live test at 1 million connections per second showed no performance +difference from the previous value. + +Reported-by: Moshe Kol +Reported-by: Yossi Gilad +Reported-by: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -541,11 +541,12 @@ EXPORT_SYMBOL_GPL(inet_unhash); + * Note that we use 32bit integers (vs RFC 'short integers') + * because 2^16 is not a multiple of num_ephemeral and this + * property might be used by clever attacker. 
+- * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, +- * we use 256 instead to really give more isolation and +- * privacy, this only consumes 1 KB of kernel memory. ++ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though ++ * attacks were since demonstrated, thus we use 65536 instead to really ++ * give more isolation and privacy, at the expense of 256kB of kernel ++ * memory. + */ +-#define INET_TABLE_PERTURB_SHIFT 8 ++#define INET_TABLE_PERTURB_SHIFT 16 + #define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) + static u32 *table_perturb; + diff --git a/queue-4.9/tcp-use-different-parts-of-the-port_offset-for-index-and-offset.patch b/queue-4.9/tcp-use-different-parts-of-the-port_offset-for-index-and-offset.patch new file mode 100644 index 00000000000..bca17ae8140 --- /dev/null +++ b/queue-4.9/tcp-use-different-parts-of-the-port_offset-for-index-and-offset.patch @@ -0,0 +1,37 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Willy Tarreau +Date: Mon, 2 May 2022 10:46:09 +0200 +Subject: tcp: use different parts of the port_offset for index and offset + +From: Willy Tarreau + +commit 9e9b70ae923baf2b5e8a0ea4fd0c8451801ac526 upstream. + +Amit Klein suggests that we use different parts of port_offset for the +table's index and the port offset so that there is no direct relation +between them. + +Cc: Jason A. Donenfeld +Cc: Moshe Kol +Cc: Yossi Gilad +Cc: Amit Klein +Reviewed-by: Eric Dumazet +Signed-off-by: Willy Tarreau +Signed-off-by: Jakub Kicinski +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_hashtables.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -589,7 +589,7 @@ int __inet_hash_connect(struct inet_time + net_get_random_once(table_perturb, sizeof(table_perturb)); + index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + +- offset = READ_ONCE(table_perturb[index]) + port_offset; ++ offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); + offset %= remaining; + + /* In first pass we try ports of @low parity. diff --git a/queue-4.9/xprtrdma-fix-incorrect-header-size-calculations.patch b/queue-4.9/xprtrdma-fix-incorrect-header-size-calculations.patch new file mode 100644 index 00000000000..d6cbeecfa66 --- /dev/null +++ b/queue-4.9/xprtrdma-fix-incorrect-header-size-calculations.patch @@ -0,0 +1,46 @@ +From foo@baz Thu Jun 23 05:59:40 PM CEST 2022 +From: Colin Ian King +Date: Wed, 15 Jul 2020 17:26:04 +0100 +Subject: xprtrdma: fix incorrect header size calculations + +From: Colin Ian King + +commit 912288442cb2f431bf3c8cb097a5de83bc6dbac1 upstream. + +Currently the header size calculations are using an assignment +operator instead of a += operator when accumulating the header +size leading to incorrect sizes. Fix this by using the correct +operator. 
+ +Addresses-Coverity: ("Unused value") +Fixes: 302d3deb2068 ("xprtrdma: Prevent inline overflow") +Signed-off-by: Colin Ian King +Reviewed-by: Chuck Lever +Signed-off-by: Anna Schumaker +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -75,7 +75,7 @@ static unsigned int rpcrdma_max_call_hea + + /* Maximum Read list size */ + maxsegs += 2; /* segment for head and tail buffers */ +- size = maxsegs * sizeof(struct rpcrdma_read_chunk); ++ size += maxsegs * sizeof(struct rpcrdma_read_chunk); + + /* Minimal Read chunk size */ + size += sizeof(__be32); /* segment count */ +@@ -101,7 +101,7 @@ static unsigned int rpcrdma_max_reply_he + + /* Maximum Write list size */ + maxsegs += 2; /* segment for head and tail buffers */ +- size = sizeof(__be32); /* segment count */ ++ size += sizeof(__be32); /* segment count */ + size += maxsegs * sizeof(struct rpcrdma_segment); + size += sizeof(__be32); /* list discriminator */ +
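
For context on the tcp-* source-port patches queued above: the following is a
minimal, stand-alone user-space sketch (plain C, assumed identifiers, toy
values) of the double-hash port selection scheme from RFC 6056 Algorithm 4
that this series hardens. It is an illustration only, not the kernel
implementation: it mirrors the final shape of the series (a 2^16-entry
perturbation table allocated at boot, the low bits of the 64-bit siphash
selecting the table index, the upper 32 bits providing the starting offset,
and the counter advancing after each use) but omits details such as
parity-first scanning, the random 0..7 increment and bind-conflict checks.

/* illustrative sketch only -- all identifiers are assumptions, not kernel code */
#include <stdint.h>
#include <stdio.h>

#define PERTURB_SHIFT	16			/* 2^16 entries, as in the last patch */
#define PERTURB_SIZE	(1u << PERTURB_SHIFT)

static uint32_t table_perturb[PERTURB_SIZE];	/* the kernel kmalloc()s this at boot */

/* port_offset stands in for the 64-bit siphash of (saddr, daddr, dport) */
static unsigned int pick_port(uint64_t port_offset, unsigned int low, unsigned int high)
{
	uint32_t remaining = high - low + 1;
	/* low bits select the perturbation bucket, upper 32 bits the offset */
	uint32_t index  = (uint32_t)port_offset & (PERTURB_SIZE - 1);
	uint32_t offset = (table_perturb[index] + (uint32_t)(port_offset >> 32)) % remaining;

	/* advance the bucket so the next connect() to the same destination
	 * starts elsewhere; the real code adds i + 2 plus a small random step
	 */
	table_perturb[index] += 2;

	return low + offset;
}

int main(void)
{
	uint64_t fake_hash = 0x1234567890abcdefULL;	/* pretend siphash output */
	int i;

	for (i = 0; i < 4; i++)
		printf("candidate source port: %u\n",
		       pick_port(fake_hash, 32768, 60999));
	return 0;
}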