--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:08 +0100
+Subject: bpf: add bpf_jit_limit knob to restrict unpriv allocations
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230008.GG9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit ede95a63b5e84ddeea6b0c473b36ab8bfd8c6ce3 upstream.
+
+Rick reported that the BPF JIT could potentially fill the entire module
+space with BPF programs from unprivileged users which would prevent later
+attempts to load normal kernel modules or privileged BPF programs, for
+example. If JIT was enabled but failed to generate the image, then
+before commit 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config")
+we would always fall back to the BPF interpreter. Nowadays in the case
+where the CONFIG_BPF_JIT_ALWAYS_ON could be set, then the load will abort
+with a failure since the BPF interpreter was compiled out.
+
+Add a global limit and enforce it for unprivileged users such that in case
+of BPF interpreter compiled out we fail once the limit has been reached
+or we fall back to BPF interpreter earlier w/o using module mem if latter
+was compiled in. In a next step, fair share among unprivileged users can
+be resolved in particular for the case where we would fail hard once limit
+is reached.
+
+Fixes: 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config")
+Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64")
+Co-Developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Cc: Eric Dumazet <eric.dumazet@gmail.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: LKML <linux-kernel@vger.kernel.org>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/sysctl/net.txt | 8 +++++++
+ include/linux/filter.h | 1
+ kernel/bpf/core.c | 49 ++++++++++++++++++++++++++++++++++++++++---
+ net/core/sysctl_net_core.c | 10 +++++++-
+ 4 files changed, 63 insertions(+), 5 deletions(-)
+
+--- a/Documentation/sysctl/net.txt
++++ b/Documentation/sysctl/net.txt
+@@ -54,6 +54,14 @@ Values :
+ 1 - enable JIT hardening for unprivileged users only
+ 2 - enable JIT hardening for all users
+
++bpf_jit_limit
++-------------
++
++This enforces a global limit for memory allocations to the BPF JIT
++compiler in order to reject unprivileged JIT requests once it has
++been surpassed. bpf_jit_limit contains the value of the global limit
++in bytes.
++
+ dev_weight
+ --------------
+
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -599,6 +599,7 @@ void bpf_warn_invalid_xdp_action(u32 act
+ #ifdef CONFIG_BPF_JIT
+ extern int bpf_jit_enable;
+ extern int bpf_jit_harden;
++extern int bpf_jit_limit;
+
+ typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
+
+--- a/kernel/bpf/core.c
++++ b/kernel/bpf/core.c
+@@ -208,9 +208,43 @@ struct bpf_prog *bpf_patch_insn_single(s
+ }
+
+ #ifdef CONFIG_BPF_JIT
++# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000)
++
+ /* All BPF JIT sysctl knobs here. */
+ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+ int bpf_jit_harden __read_mostly;
++int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT;
++
++static atomic_long_t bpf_jit_current;
++
++#if defined(MODULES_VADDR)
++static int __init bpf_jit_charge_init(void)
++{
++ /* Only used as heuristic here to derive limit. */
++ bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2,
++ PAGE_SIZE), INT_MAX);
++ return 0;
++}
++pure_initcall(bpf_jit_charge_init);
++#endif
++
++static int bpf_jit_charge_modmem(u32 pages)
++{
++ if (atomic_long_add_return(pages, &bpf_jit_current) >
++ (bpf_jit_limit >> PAGE_SHIFT)) {
++ if (!capable(CAP_SYS_ADMIN)) {
++ atomic_long_sub(pages, &bpf_jit_current);
++ return -EPERM;
++ }
++ }
++
++ return 0;
++}
++
++static void bpf_jit_uncharge_modmem(u32 pages)
++{
++ atomic_long_sub(pages, &bpf_jit_current);
++}
+
+ struct bpf_binary_header *
+ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+@@ -218,21 +252,27 @@ bpf_jit_binary_alloc(unsigned int progle
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+ {
+ struct bpf_binary_header *hdr;
+- unsigned int size, hole, start;
++ u32 size, hole, start, pages;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
++ pages = size / PAGE_SIZE;
++
++ if (bpf_jit_charge_modmem(pages))
++ return NULL;
+ hdr = module_alloc(size);
+- if (hdr == NULL)
++ if (!hdr) {
++ bpf_jit_uncharge_modmem(pages);
+ return NULL;
++ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+- hdr->pages = size / PAGE_SIZE;
++ hdr->pages = pages;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+@@ -245,7 +285,10 @@ bpf_jit_binary_alloc(unsigned int progle
+
+ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+ {
++ u32 pages = hdr->pages;
++
+ module_memfree(hdr);
++ bpf_jit_uncharge_modmem(pages);
+ }
+
+ static int bpf_jit_blind_insn(const struct bpf_insn *from,
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -253,7 +253,6 @@ static int proc_dointvec_minmax_bpf_enab
+ return ret;
+ }
+
+-# ifdef CONFIG_HAVE_EBPF_JIT
+ static int
+ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+@@ -264,7 +263,6 @@ proc_dointvec_minmax_bpf_restricted(stru
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ }
+-# endif
+ #endif
+
+ static struct ctl_table net_core_table[] = {
+@@ -348,6 +346,14 @@ static struct ctl_table net_core_table[]
+ .extra2 = &two,
+ },
+ # endif
++ {
++ .procname = "bpf_jit_limit",
++ .data = &bpf_jit_limit,
++ .maxlen = sizeof(int),
++ .mode = 0600,
++ .proc_handler = proc_dointvec_minmax_bpf_restricted,
++ .extra1 = &one,
++ },
+ #endif
+ {
+ .procname = "netdev_tstamp_prequeue",
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:21 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Fri, 16 Aug 2019 23:59:20 +0100
+Subject: bpf: get rid of pure_initcall dependency to enable jits
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816225920.GE9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit fa9dd599b4dae841924b022768354cfde9affecb upstream.
+
+Having a pure_initcall() callback just to permanently enable BPF
+JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave
+a small race window in future where JIT is still disabled on boot.
+Since we know about the setting at compilation time anyway, just
+initialize it properly there. Also consolidate all the individual
+bpf_jit_enable variables into a single one and move them under one
+location. Moreover, don't allow for setting unspecified garbage
+values on them.
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[bwh: Backported to 4.9 as dependency of commit 2e4a30983b0f
+ "bpf: restrict access to core bpf sysctls":
+ - Drop change in arch/mips/net/ebpf_jit.c
+ - Drop change to bpf_jit_kallsyms
+ - Adjust filenames, context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/net/bpf_jit_32.c | 2 --
+ arch/arm64/net/bpf_jit_comp.c | 2 --
+ arch/mips/net/bpf_jit.c | 2 --
+ arch/powerpc/net/bpf_jit_comp.c | 2 --
+ arch/powerpc/net/bpf_jit_comp64.c | 2 --
+ arch/s390/net/bpf_jit_comp.c | 2 --
+ arch/sparc/net/bpf_jit_comp.c | 2 --
+ arch/x86/net/bpf_jit_comp.c | 2 --
+ kernel/bpf/core.c | 15 +++++++++++----
+ net/core/sysctl_net_core.c | 14 +++++++++-----
+ net/socket.c | 9 ---------
+ 11 files changed, 20 insertions(+), 34 deletions(-)
+
+--- a/arch/arm/net/bpf_jit_32.c
++++ b/arch/arm/net/bpf_jit_32.c
+@@ -72,8 +72,6 @@ struct jit_ctx {
+ #endif
+ };
+
+-int bpf_jit_enable __read_mostly;
+-
+ static inline int call_neg_helper(struct sk_buff *skb, int offset, void *ret,
+ unsigned int size)
+ {
+--- a/arch/arm64/net/bpf_jit_comp.c
++++ b/arch/arm64/net/bpf_jit_comp.c
+@@ -30,8 +30,6 @@
+
+ #include "bpf_jit.h"
+
+-int bpf_jit_enable __read_mostly;
+-
+ #define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
+ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
+ #define TCALL_CNT (MAX_BPF_JIT_REG + 2)
+--- a/arch/mips/net/bpf_jit.c
++++ b/arch/mips/net/bpf_jit.c
+@@ -1194,8 +1194,6 @@ jmp_cmp:
+ return 0;
+ }
+
+-int bpf_jit_enable __read_mostly;
+-
+ void bpf_jit_compile(struct bpf_prog *fp)
+ {
+ struct jit_ctx ctx;
+--- a/arch/powerpc/net/bpf_jit_comp.c
++++ b/arch/powerpc/net/bpf_jit_comp.c
+@@ -18,8 +18,6 @@
+
+ #include "bpf_jit32.h"
+
+-int bpf_jit_enable __read_mostly;
+-
+ static inline void bpf_flush_icache(void *start, void *end)
+ {
+ smp_wmb();
+--- a/arch/powerpc/net/bpf_jit_comp64.c
++++ b/arch/powerpc/net/bpf_jit_comp64.c
+@@ -21,8 +21,6 @@
+
+ #include "bpf_jit64.h"
+
+-int bpf_jit_enable __read_mostly;
+-
+ static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
+ {
+ int *p = area;
+--- a/arch/s390/net/bpf_jit_comp.c
++++ b/arch/s390/net/bpf_jit_comp.c
+@@ -28,8 +28,6 @@
+ #include <asm/nospec-branch.h>
+ #include "bpf_jit.h"
+
+-int bpf_jit_enable __read_mostly;
+-
+ struct bpf_jit {
+ u32 seen; /* Flags to remember seen eBPF instructions */
+ u32 seen_reg[16]; /* Array to remember which registers are used */
+--- a/arch/sparc/net/bpf_jit_comp.c
++++ b/arch/sparc/net/bpf_jit_comp.c
+@@ -10,8 +10,6 @@
+
+ #include "bpf_jit.h"
+
+-int bpf_jit_enable __read_mostly;
+-
+ static inline bool is_simm13(unsigned int value)
+ {
+ return value + 0x1000 < 0x2000;
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -15,8 +15,6 @@
+ #include <asm/nospec-branch.h>
+ #include <linux/bpf.h>
+
+-int bpf_jit_enable __read_mostly;
+-
+ /*
+ * assembly code in arch/x86/net/bpf_jit.S
+ */
+--- a/kernel/bpf/core.c
++++ b/kernel/bpf/core.c
+@@ -208,6 +208,10 @@ struct bpf_prog *bpf_patch_insn_single(s
+ }
+
+ #ifdef CONFIG_BPF_JIT
++/* All BPF JIT sysctl knobs here. */
++int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
++int bpf_jit_harden __read_mostly;
++
+ struct bpf_binary_header *
+ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+@@ -244,8 +248,6 @@ void bpf_jit_binary_free(struct bpf_bina
+ module_memfree(hdr);
+ }
+
+-int bpf_jit_harden __read_mostly;
+-
+ static int bpf_jit_blind_insn(const struct bpf_insn *from,
+ const struct bpf_insn *aux,
+ struct bpf_insn *to_buff)
+@@ -925,8 +927,13 @@ load_byte:
+ STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+
+ #else
+-static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
++static unsigned int __bpf_prog_ret0_warn(void *ctx,
++ const struct bpf_insn *insn)
+ {
++ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
++ * is not working properly, so warn about it!
++ */
++ WARN_ON_ONCE(1);
+ return 0;
+ }
+ #endif
+@@ -981,7 +988,7 @@ struct bpf_prog *bpf_prog_select_runtime
+ #ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ fp->bpf_func = (void *) __bpf_prog_run;
+ #else
+- fp->bpf_func = (void *) __bpf_prog_ret0;
++ fp->bpf_func = (void *) __bpf_prog_ret0_warn;
+ #endif
+
+ /* eBPF JITs can rewrite the program in case constant
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -24,6 +24,7 @@
+
+ static int zero = 0;
+ static int one = 1;
++static int two __maybe_unused = 2;
+ static int min_sndbuf = SOCK_MIN_SNDBUF;
+ static int min_rcvbuf = SOCK_MIN_RCVBUF;
+ static int max_skb_frags = MAX_SKB_FRAGS;
+@@ -292,13 +293,14 @@ static struct ctl_table net_core_table[]
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+- .proc_handler = proc_dointvec
+-#else
+ .proc_handler = proc_dointvec_minmax,
++# ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ .extra1 = &one,
+ .extra2 = &one,
+-#endif
++# else
++ .extra1 = &zero,
++ .extra2 = &two,
++# endif
+ },
+ # ifdef CONFIG_HAVE_EBPF_JIT
+ {
+@@ -306,7 +308,9 @@ static struct ctl_table net_core_table[]
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+- .proc_handler = proc_dointvec,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = &zero,
++ .extra2 = &two,
+ },
+ # endif
+ #endif
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -2550,15 +2550,6 @@ out_fs:
+
+ core_initcall(sock_init); /* early initcall */
+
+-static int __init jit_init(void)
+-{
+-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+- bpf_jit_enable = 1;
+-#endif
+- return 0;
+-}
+-pure_initcall(jit_init);
+-
+ #ifdef CONFIG_PROC_FS
+ void socket_seq_show(struct seq_file *seq)
+ {
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Fri, 16 Aug 2019 23:59:56 +0100
+Subject: bpf: restrict access to core bpf sysctls
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816225956.GF9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 upstream.
+
+Given BPF reaches far beyond just networking these days, it was
+never intended to allow setting and in some cases reading those
+knobs out of a user namespace root running without CAP_SYS_ADMIN,
+thus tighten such access.
+
+Also the bpf_jit_enable = 2 debugging mode should only be allowed
+if kptr_restrict is not set since it otherwise can leak addresses
+to the kernel log. Dump a note to the kernel log that this is for
+debugging JITs only when enabled.
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+[bwh: Backported to 4.9:
+ - We don't have bpf_dump_raw_ok(), so drop the condition based on it. This
+ condition only made it a bit harder for a privileged user to do something
+ silly.
+ - Drop change to bpf_jit_kallsyms]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/sysctl_net_core.c | 39 +++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 37 insertions(+), 2 deletions(-)
+
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -232,6 +232,41 @@ static int proc_do_rss_key(struct ctl_ta
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+ }
+
++#ifdef CONFIG_BPF_JIT
++static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ int ret, jit_enable = *(int *)table->data;
++ struct ctl_table tmp = *table;
++
++ if (write && !capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ tmp.data = &jit_enable;
++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
++ if (write && !ret) {
++ *(int *)table->data = jit_enable;
++ if (jit_enable == 2)
++ pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
++ }
++ return ret;
++}
++
++# ifdef CONFIG_HAVE_EBPF_JIT
++static int
++proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
++}
++# endif
++#endif
++
+ static struct ctl_table net_core_table[] = {
+ #ifdef CONFIG_NET
+ {
+@@ -293,7 +328,7 @@ static struct ctl_table net_core_table[]
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = proc_dointvec_minmax,
++ .proc_handler = proc_dointvec_minmax_bpf_enable,
+ # ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ .extra1 = &one,
+ .extra2 = &one,
+@@ -308,7 +343,7 @@ static struct ctl_table net_core_table[]
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+- .proc_handler = proc_dointvec_minmax,
++ .proc_handler = proc_dointvec_minmax_bpf_restricted,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:01:27 +0100
+Subject: inet: switch IP ID generator to siphash
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230127.GP9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit df453700e8d81b1bdafdf684365ee2b9431fb702 upstream.
+
+According to Amit Klein and Benny Pinkas, IP ID generation is too weak
+and might be used by attackers.
+
+Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
+having 64bit key and Jenkins hash is risky.
+
+It is time to switch to siphash and its 128bit keys.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Amit Klein <aksecurity@gmail.com>
+Reported-by: Benny Pinkas <benny@pinkas.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/siphash.h | 5 +++++
+ include/net/netns/ipv4.h | 2 ++
+ net/ipv4/route.c | 12 +++++++-----
+ net/ipv6/output_core.c | 30 ++++++++++++++++--------------
+ 4 files changed, 30 insertions(+), 19 deletions(-)
+
+--- a/include/linux/siphash.h
++++ b/include/linux/siphash.h
+@@ -21,6 +21,11 @@ typedef struct {
+ u64 key[2];
+ } siphash_key_t;
+
++static inline bool siphash_key_is_zero(const siphash_key_t *key)
++{
++ return !(key->key[0] | key->key[1]);
++}
++
+ u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
+ #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
+--- a/include/net/netns/ipv4.h
++++ b/include/net/netns/ipv4.h
+@@ -8,6 +8,7 @@
+ #include <linux/uidgid.h>
+ #include <net/inet_frag.h>
+ #include <linux/rcupdate.h>
++#include <linux/siphash.h>
+
+ struct tcpm_hash_bucket;
+ struct ctl_table_header;
+@@ -137,5 +138,6 @@ struct netns_ipv4 {
+ int sysctl_fib_multipath_use_neigh;
+ #endif
+ atomic_t rt_genid;
++ siphash_key_t ip_id_key;
+ };
+ #endif
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -496,15 +496,17 @@ EXPORT_SYMBOL(ip_idents_reserve);
+
+ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
+ {
+- static u32 ip_idents_hashrnd __read_mostly;
+ u32 hash, id;
+
+- net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
++ /* Note the following code is not safe, but this is okay. */
++ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
++ get_random_bytes(&net->ipv4.ip_id_key,
++ sizeof(net->ipv4.ip_id_key));
+
+- hash = jhash_3words((__force u32)iph->daddr,
++ hash = siphash_3u32((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+- iph->protocol ^ net_hash_mix(net),
+- ip_idents_hashrnd);
++ iph->protocol,
++ &net->ipv4.ip_id_key);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
+ }
+--- a/net/ipv6/output_core.c
++++ b/net/ipv6/output_core.c
+@@ -10,15 +10,25 @@
+ #include <net/secure_seq.h>
+ #include <linux/netfilter.h>
+
+-static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
++static u32 __ipv6_select_ident(struct net *net,
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
+ {
++ const struct {
++ struct in6_addr dst;
++ struct in6_addr src;
++ } __aligned(SIPHASH_ALIGNMENT) combined = {
++ .dst = *dst,
++ .src = *src,
++ };
+ u32 hash, id;
+
+- hash = __ipv6_addr_jhash(dst, hashrnd);
+- hash = __ipv6_addr_jhash(src, hash);
+- hash ^= net_hash_mix(net);
++ /* Note the following code is not safe, but this is okay. */
++ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
++ get_random_bytes(&net->ipv4.ip_id_key,
++ sizeof(net->ipv4.ip_id_key));
++
++ hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key);
+
+ /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
+ * set the hight order instead thus minimizing possible future
+@@ -41,7 +51,6 @@ static u32 __ipv6_select_ident(struct ne
+ */
+ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
+ {
+- static u32 ip6_proxy_idents_hashrnd __read_mostly;
+ struct in6_addr buf[2];
+ struct in6_addr *addrs;
+ u32 id;
+@@ -53,11 +62,7 @@ void ipv6_proxy_select_ident(struct net
+ if (!addrs)
+ return;
+
+- net_get_random_once(&ip6_proxy_idents_hashrnd,
+- sizeof(ip6_proxy_idents_hashrnd));
+-
+- id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
+- &addrs[1], &addrs[0]);
++ id = __ipv6_select_ident(net, &addrs[1], &addrs[0]);
+ skb_shinfo(skb)->ip6_frag_id = htonl(id);
+ }
+ EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
+@@ -66,12 +71,9 @@ __be32 ipv6_select_ident(struct net *net
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+ {
+- static u32 ip6_idents_hashrnd __read_mostly;
+ u32 id;
+
+- net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
+-
+- id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
++ id = __ipv6_select_ident(net, daddr, saddr);
+ return htonl(id);
+ }
+ EXPORT_SYMBOL(ipv6_select_ident);
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:01:34 +0100
+Subject: netfilter: ctnetlink: don't use conntrack/expect object addresses as id
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230134.GQ9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 3c79107631db1f7fd32cf3f7368e4672004a3010 upstream.
+
+else, we leak the addresses to userspace via ctnetlink events
+and dumps.
+
+Compute an ID on demand based on the immutable parts of nf_conn struct.
+
+Another advantage compared to using an address is that there is no
+immediate re-use of the same ID in case the conntrack entry is freed and
+reallocated again immediately.
+
+Fixes: 3583240249ef ("[NETFILTER]: nf_conntrack_expect: kill unique ID")
+Fixes: 7f85f914721f ("[NETFILTER]: nf_conntrack: kill unique ID")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/netfilter/nf_conntrack.h | 2 ++
+ net/netfilter/nf_conntrack_core.c | 35 +++++++++++++++++++++++++++++++++++
+ net/netfilter/nf_conntrack_netlink.c | 34 +++++++++++++++++++++++++++++-----
+ 3 files changed, 66 insertions(+), 5 deletions(-)
+
+--- a/include/net/netfilter/nf_conntrack.h
++++ b/include/net/netfilter/nf_conntrack.h
+@@ -336,6 +336,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct
+ gfp_t flags);
+ void nf_ct_tmpl_free(struct nf_conn *tmpl);
+
++u32 nf_ct_get_id(const struct nf_conn *ct);
++
+ #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count)
+ #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
+ #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -25,6 +25,7 @@
+ #include <linux/slab.h>
+ #include <linux/random.h>
+ #include <linux/jhash.h>
++#include <linux/siphash.h>
+ #include <linux/err.h>
+ #include <linux/percpu.h>
+ #include <linux/moduleparam.h>
+@@ -301,6 +302,40 @@ nf_ct_invert_tuple(struct nf_conntrack_t
+ }
+ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
+
++/* Generate an almost-unique pseudo-id for a given conntrack.
++ *
++ * intentionally doesn't re-use any of the seeds used for hash
++ * table location, we assume id gets exposed to userspace.
++ *
++ * Following nf_conn items do not change throughout lifetime
++ * of the nf_conn after it has been committed to main hash table:
++ *
++ * 1. nf_conn address
++ * 2. nf_conn->ext address
++ * 3. nf_conn->master address (normally NULL)
++ * 4. tuple
++ * 5. the associated net namespace
++ */
++u32 nf_ct_get_id(const struct nf_conn *ct)
++{
++ static __read_mostly siphash_key_t ct_id_seed;
++ unsigned long a, b, c, d;
++
++ net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
++
++ a = (unsigned long)ct;
++ b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct));
++ c = (unsigned long)ct->ext;
++ d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash),
++ &ct_id_seed);
++#ifdef CONFIG_64BIT
++ return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
++#else
++ return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
++#endif
++}
++EXPORT_SYMBOL_GPL(nf_ct_get_id);
++
+ static void
+ clean_from_lists(struct nf_conn *ct)
+ {
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -29,6 +29,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
+ #include <linux/slab.h>
++#include <linux/siphash.h>
+
+ #include <linux/netfilter.h>
+ #include <net/netlink.h>
+@@ -441,7 +442,9 @@ static int ctnetlink_dump_ct_seq_adj(str
+
+ static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+ {
+- if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
++ __be32 id = (__force __be32)nf_ct_get_id(ct);
++
++ if (nla_put_be32(skb, CTA_ID, id))
+ goto nla_put_failure;
+ return 0;
+
+@@ -1166,8 +1169,9 @@ static int ctnetlink_del_conntrack(struc
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (cda[CTA_ID]) {
+- u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
+- if (id != (u32)(unsigned long)ct) {
++ __be32 id = nla_get_be32(cda[CTA_ID]);
++
++ if (id != (__force __be32)nf_ct_get_id(ct)) {
+ nf_ct_put(ct);
+ return -ENOENT;
+ }
+@@ -2472,6 +2476,25 @@ nla_put_failure:
+
+ static const union nf_inet_addr any_addr;
+
++static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
++{
++ static __read_mostly siphash_key_t exp_id_seed;
++ unsigned long a, b, c, d;
++
++ net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
++
++ a = (unsigned long)exp;
++ b = (unsigned long)exp->helper;
++ c = (unsigned long)exp->master;
++ d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed);
++
++#ifdef CONFIG_64BIT
++ return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed);
++#else
++ return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed);
++#endif
++}
++
+ static int
+ ctnetlink_exp_dump_expect(struct sk_buff *skb,
+ const struct nf_conntrack_expect *exp)
+@@ -2519,7 +2542,7 @@ ctnetlink_exp_dump_expect(struct sk_buff
+ }
+ #endif
+ if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
+- nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
++ nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
+ nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
+ goto nla_put_failure;
+@@ -2818,7 +2841,8 @@ static int ctnetlink_get_expect(struct n
+
+ if (cda[CTA_EXPECT_ID]) {
+ __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+- if (ntohl(id) != (u32)(unsigned long)exp) {
++
++ if (id != nf_expect_get_id(exp)) {
+ nf_ct_expect_put(exp);
+ return -ENOENT;
+ }
sh-kernel-hw_breakpoint-fix-missing-break-in-switch-statement.patch
mm-usercopy-use-memory-range-to-be-accessed-for-wraparound-check.patch
mm-memcontrol.c-fix-use-after-free-in-mem_cgroup_iter.patch
+bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch
+bpf-restrict-access-to-core-bpf-sysctls.patch
+bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch
+vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch
+vhost_net-use-packet-weight-for-rx-handler-too.patch
+vhost_net-introduce-vhost_exceeds_weight.patch
+vhost-introduce-vhost_exceeds_weight.patch
+vhost_net-fix-possible-infinite-loop.patch
+vhost-scsi-add-weight-support.patch
+siphash-add-cryptographically-secure-prf.patch
+siphash-implement-halfsiphash1-3-for-hash-tables.patch
+inet-switch-ip-id-generator-to-siphash.patch
+netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:01:12 +0100
+Subject: siphash: add cryptographically secure PRF
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230112.GN9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit 2c956a60778cbb6a27e0c7a8a52a91378c90e1d1 upstream.
+
+SipHash is a 64-bit keyed hash function that is actually a
+cryptographically secure PRF, like HMAC. Except SipHash is super fast,
+and is meant to be used as a hashtable keyed lookup function, or as a
+general PRF for short input use cases, such as sequence numbers or RNG
+chaining.
+
+For the first usage:
+
+There are a variety of attacks known as "hashtable poisoning" in which an
+attacker forms some data such that the hash of that data will be the
+same, and then proceeds to fill up all entries of a hashbucket. This is
+a realistic and well-known denial-of-service vector. Currently
+hashtables use jhash, which is fast but not secure, and some kind of
+rotating key scheme (or none at all, which isn't good). SipHash is meant
+as a replacement for jhash in these cases.
+
+There are a modicum of places in the kernel that are vulnerable to
+hashtable poisoning attacks, either via userspace vectors or network
+vectors, and there's not a reliable mechanism inside the kernel at the
+moment to fix it. The first step toward fixing these issues is actually
+getting a secure primitive into the kernel for developers to use. Then
+we can, bit by bit, port things over to it as deemed appropriate.
+
+While SipHash is extremely fast for a cryptographically secure function,
+it is likely a bit slower than the insecure jhash, and so replacements
+will be evaluated on a case-by-case basis based on whether or not the
+difference in speed is negligible and whether or not the current jhash usage
+poses a real security risk.
+
+For the second usage:
+
+A few places in the kernel are using MD5 or SHA1 for creating secure
+sequence numbers, syn cookies, port numbers, or fast random numbers.
+SipHash is a faster, more fitting, and more secure replacement for MD5
+in those situations. Replacing MD5 and SHA1 with SipHash for these uses is
+obvious and straight-forward, and so is submitted along with this patch
+series. There shouldn't be much of a debate over its efficacy.
+
+Dozens of languages are already using this internally for their hash
+tables and PRFs. Some of the BSDs already use this in their kernels.
+SipHash is a widely known high-speed solution to a widely known set of
+problems, and it's time we catch-up.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Eric Biggers <ebiggers3@gmail.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Eric Dumazet <eric.dumazet@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.9 as dependency of commits df453700e8d8 "inet: switch
+ IP ID generator to siphash" and 3c79107631db "netfilter: ctnetlink: don't
+ use conntrack/expect object addresses as id"]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/siphash.txt | 100 +++++++++++++++++++
+ MAINTAINERS | 7 +
+ include/linux/siphash.h | 85 ++++++++++++++++
+ lib/Kconfig.debug | 6 -
+ lib/Makefile | 5
+ lib/siphash.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++
+ lib/test_siphash.c | 131 +++++++++++++++++++++++++
+ 7 files changed, 561 insertions(+), 5 deletions(-)
+ create mode 100644 Documentation/siphash.txt
+ create mode 100644 include/linux/siphash.h
+ create mode 100644 lib/siphash.c
+ create mode 100644 lib/test_siphash.c
+
+--- /dev/null
++++ b/Documentation/siphash.txt
+@@ -0,0 +1,100 @@
++ SipHash - a short input PRF
++-----------------------------------------------
++Written by Jason A. Donenfeld <jason@zx2c4.com>
++
++SipHash is a cryptographically secure PRF -- a keyed hash function -- that
++performs very well for short inputs, hence the name. It was designed by
++cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended
++as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`,
++and so forth.
++
++SipHash takes a secret key filled with randomly generated numbers and either
++an input buffer or several input integers. It spits out an integer that is
++indistinguishable from random. You may then use that integer as part of secure
++sequence numbers, secure cookies, or mask it off for use in a hash table.
++
++1. Generating a key
++
++Keys should always be generated from a cryptographically secure source of
++random numbers, either using get_random_bytes or get_random_once:
++
++siphash_key_t key;
++get_random_bytes(&key, sizeof(key));
++
++If you're not deriving your key from here, you're doing it wrong.
++
++2. Using the functions
++
++There are two variants of the function, one that takes a list of integers, and
++one that takes a buffer:
++
++u64 siphash(const void *data, size_t len, const siphash_key_t *key);
++
++And:
++
++u64 siphash_1u64(u64, const siphash_key_t *key);
++u64 siphash_2u64(u64, u64, const siphash_key_t *key);
++u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key);
++u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key);
++u64 siphash_1u32(u32, const siphash_key_t *key);
++u64 siphash_2u32(u32, u32, const siphash_key_t *key);
++u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key);
++u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key);
++
++If you pass the generic siphash function something of a constant length, it
++will constant fold at compile-time and automatically choose one of the
++optimized functions.
++
++3. Hashtable key function usage:
++
++struct some_hashtable {
++ DECLARE_HASHTABLE(hashtable, 8);
++ siphash_key_t key;
++};
++
++void init_hashtable(struct some_hashtable *table)
++{
++ get_random_bytes(&table->key, sizeof(table->key));
++}
++
++static inline struct hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
++{
++ return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
++}
++
++You may then iterate like usual over the returned hash bucket.
++
++4. Security
++
++SipHash has a very high security margin, with its 128-bit key. So long as the
++key is kept secret, it is impossible for an attacker to guess the outputs of
++the function, even if being able to observe many outputs, since 2^128 outputs
++is significant.
++
++Linux implements the "2-4" variant of SipHash.
++
++5. Struct-passing Pitfalls
++
++Often times the XuY functions will not be large enough, and instead you'll
++want to pass a pre-filled struct to siphash. When doing this, it's important
++to always ensure the struct has no padding holes. The easiest way to do this
++is to simply arrange the members of the struct in descending order of size,
++and to use offsetofend() instead of sizeof() for getting the size. For
++performance reasons, if possible, it's probably a good thing to align the
++struct to the right boundary. Here's an example:
++
++const struct {
++ struct in6_addr saddr;
++ u32 counter;
++ u16 dport;
++} __aligned(SIPHASH_ALIGNMENT) combined = {
++ .saddr = *(struct in6_addr *)saddr,
++ .counter = counter,
++ .dport = dport
++};
++u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret);
++
++6. Resources
++
++Read the SipHash paper if you're interested in learning more:
++https://131002.net/siphash/siphash.pdf
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -11068,6 +11068,13 @@ F: arch/arm/mach-s3c24xx/mach-bast.c
+ F: arch/arm/mach-s3c24xx/bast-ide.c
+ F: arch/arm/mach-s3c24xx/bast-irq.c
+
++SIPHASH PRF ROUTINES
++M: Jason A. Donenfeld <Jason@zx2c4.com>
++S: Maintained
++F: lib/siphash.c
++F: lib/test_siphash.c
++F: include/linux/siphash.h
++
+ TI DAVINCI MACHINE SUPPORT
+ M: Sekhar Nori <nsekhar@ti.com>
+ M: Kevin Hilman <khilman@kernel.org>
+--- /dev/null
++++ b/include/linux/siphash.h
+@@ -0,0 +1,85 @@
++/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This file is provided under a dual BSD/GPLv2 license.
++ *
++ * SipHash: a fast short-input PRF
++ * https://131002.net/siphash/
++ *
++ * This implementation is specifically for SipHash2-4.
++ */
++
++#ifndef _LINUX_SIPHASH_H
++#define _LINUX_SIPHASH_H
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++
++#define SIPHASH_ALIGNMENT __alignof__(u64)
++typedef struct {
++ u64 key[2];
++} siphash_key_t;
++
++u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
++#endif
++
++u64 siphash_1u64(const u64 a, const siphash_key_t *key);
++u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
++u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
++ const siphash_key_t *key);
++u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
++ const siphash_key_t *key);
++u64 siphash_1u32(const u32 a, const siphash_key_t *key);
++u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
++ const siphash_key_t *key);
++
++static inline u64 siphash_2u32(const u32 a, const u32 b,
++ const siphash_key_t *key)
++{
++ return siphash_1u64((u64)b << 32 | a, key);
++}
++static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
++ const u32 d, const siphash_key_t *key)
++{
++ return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
++}
++
++
++static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
++ const siphash_key_t *key)
++{
++ if (__builtin_constant_p(len) && len == 4)
++ return siphash_1u32(le32_to_cpup((const __le32 *)data), key);
++ if (__builtin_constant_p(len) && len == 8)
++ return siphash_1u64(le64_to_cpu(data[0]), key);
++ if (__builtin_constant_p(len) && len == 16)
++ return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
++ key);
++ if (__builtin_constant_p(len) && len == 24)
++ return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
++ le64_to_cpu(data[2]), key);
++ if (__builtin_constant_p(len) && len == 32)
++ return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
++ le64_to_cpu(data[2]), le64_to_cpu(data[3]),
++ key);
++ return __siphash_aligned(data, len, key);
++}
++
++/**
++ * siphash - compute 64-bit siphash PRF value
++ * @data: buffer to hash
++ * @len: size of @data
++ * @key: the siphash key
++ */
++static inline u64 siphash(const void *data, size_t len,
++ const siphash_key_t *key)
++{
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++ if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
++ return __siphash_unaligned(data, len, key);
++#endif
++ return ___siphash_aligned(data, len, key);
++}
++
++#endif /* _LINUX_SIPHASH_H */
+--- a/lib/Kconfig.debug
++++ b/lib/Kconfig.debug
+@@ -1822,9 +1822,9 @@ config TEST_HASH
+ tristate "Perform selftest on hash functions"
+ default n
+ help
+- Enable this option to test the kernel's integer (<linux/hash,h>)
+- and string (<linux/stringhash.h>) hash functions on boot
+- (or module load).
++ Enable this option to test the kernel's integer (<linux/hash.h>),
++ string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
++ hash functions on boot (or module load).
+
+ This is intended to help people writing architecture-specific
+ optimized versions. If unsure, say N.
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmd
+ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
+ flex_proportions.o ratelimit.o show_mem.o \
+ is_single_threaded.o plist.o decompress.o kobject_uevent.o \
+- earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
++ earlycpio.o seq_buf.o siphash.o \
++ nmi_backtrace.o nodemask.o win_minmax.o
+
+ lib-$(CONFIG_MMU) += ioremap.o
+ lib-$(CONFIG_SMP) += cpumask.o
+@@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdu
+ obj-y += kstrtox.o
+ obj-$(CONFIG_TEST_BPF) += test_bpf.o
+ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+-obj-$(CONFIG_TEST_HASH) += test_hash.o
++obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
+ obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+ CFLAGS_test_kasan.o += -fno-builtin
+ obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
+--- /dev/null
++++ b/lib/siphash.c
+@@ -0,0 +1,232 @@
++/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This file is provided under a dual BSD/GPLv2 license.
++ *
++ * SipHash: a fast short-input PRF
++ * https://131002.net/siphash/
++ *
++ * This implementation is specifically for SipHash2-4.
++ */
++
++#include <linux/siphash.h>
++#include <asm/unaligned.h>
++
++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
++#include <linux/dcache.h>
++#include <asm/word-at-a-time.h>
++#endif
++
++#define SIPROUND \
++ do { \
++ v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
++ v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
++ v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
++ v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
++ } while (0)
++
++#define PREAMBLE(len) \
++ u64 v0 = 0x736f6d6570736575ULL; \
++ u64 v1 = 0x646f72616e646f6dULL; \
++ u64 v2 = 0x6c7967656e657261ULL; \
++ u64 v3 = 0x7465646279746573ULL; \
++ u64 b = ((u64)(len)) << 56; \
++ v3 ^= key->key[1]; \
++ v2 ^= key->key[0]; \
++ v1 ^= key->key[1]; \
++ v0 ^= key->key[0];
++
++#define POSTAMBLE \
++ v3 ^= b; \
++ SIPROUND; \
++ SIPROUND; \
++ v0 ^= b; \
++ v2 ^= 0xff; \
++ SIPROUND; \
++ SIPROUND; \
++ SIPROUND; \
++ SIPROUND; \
++ return (v0 ^ v1) ^ (v2 ^ v3);
++
++u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u64));
++ const u8 left = len & (sizeof(u64) - 1);
++ u64 m;
++ PREAMBLE(len)
++ for (; data != end; data += sizeof(u64)) {
++ m = le64_to_cpup(data);
++ v3 ^= m;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= m;
++ }
++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
++ if (left)
++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
++ bytemask_from_count(left)));
++#else
++ switch (left) {
++ case 7: b |= ((u64)end[6]) << 48;
++ case 6: b |= ((u64)end[5]) << 40;
++ case 5: b |= ((u64)end[4]) << 32;
++ case 4: b |= le32_to_cpup(data); break;
++ case 3: b |= ((u64)end[2]) << 16;
++ case 2: b |= le16_to_cpup(data); break;
++ case 1: b |= end[0];
++ }
++#endif
++ POSTAMBLE
++}
++EXPORT_SYMBOL(__siphash_aligned);
++
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u64));
++ const u8 left = len & (sizeof(u64) - 1);
++ u64 m;
++ PREAMBLE(len)
++ for (; data != end; data += sizeof(u64)) {
++ m = get_unaligned_le64(data);
++ v3 ^= m;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= m;
++ }
++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
++ if (left)
++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
++ bytemask_from_count(left)));
++#else
++ switch (left) {
++ case 7: b |= ((u64)end[6]) << 48;
++ case 6: b |= ((u64)end[5]) << 40;
++ case 5: b |= ((u64)end[4]) << 32;
++ case 4: b |= get_unaligned_le32(end); break;
++ case 3: b |= ((u64)end[2]) << 16;
++ case 2: b |= get_unaligned_le16(end); break;
++ case 1: b |= end[0];
++ }
++#endif
++ POSTAMBLE
++}
++EXPORT_SYMBOL(__siphash_unaligned);
++#endif
++
++/**
++ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
++ * @first: first u64
++ * @key: the siphash key
++ */
++u64 siphash_1u64(const u64 first, const siphash_key_t *key)
++{
++ PREAMBLE(8)
++ v3 ^= first;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= first;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_1u64);
++
++/**
++ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
++ * @first: first u64
++ * @second: second u64
++ * @key: the siphash key
++ */
++u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
++{
++ PREAMBLE(16)
++ v3 ^= first;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= second;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_2u64);
++
++/**
++ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
++ * @first: first u64
++ * @second: second u64
++ * @third: third u64
++ * @key: the siphash key
++ */
++u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
++ const siphash_key_t *key)
++{
++ PREAMBLE(24)
++ v3 ^= first;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= second;
++ v3 ^= third;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= third;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_3u64);
++
++/**
++ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
++ * @first: first u64
++ * @second: second u64
++ * @third: third u64
++ * @forth: fourth u64
++ * @key: the siphash key
++ */
++u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
++ const u64 forth, const siphash_key_t *key)
++{
++ PREAMBLE(32)
++ v3 ^= first;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= second;
++ v3 ^= third;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= third;
++ v3 ^= forth;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= forth;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_4u64);
++
++u64 siphash_1u32(const u32 first, const siphash_key_t *key)
++{
++ PREAMBLE(4)
++ b |= first;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_1u32);
++
++u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
++ const siphash_key_t *key)
++{
++ u64 combined = (u64)second << 32 | first;
++ PREAMBLE(12)
++ v3 ^= combined;
++ SIPROUND;
++ SIPROUND;
++ v0 ^= combined;
++ b |= third;
++ POSTAMBLE
++}
++EXPORT_SYMBOL(siphash_3u32);
+--- /dev/null
++++ b/lib/test_siphash.c
+@@ -0,0 +1,131 @@
++/* Test cases for siphash.c
++ *
++ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This file is provided under a dual BSD/GPLv2 license.
++ *
++ * SipHash: a fast short-input PRF
++ * https://131002.net/siphash/
++ *
++ * This implementation is specifically for SipHash2-4.
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/siphash.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/module.h>
++
++/* Test vectors taken from official reference source available at:
++ * https://131002.net/siphash/siphash24.c
++ */
++
++static const siphash_key_t test_key_siphash =
++ {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
++
++static const u64 test_vectors_siphash[64] = {
++ 0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL,
++ 0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL,
++ 0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL,
++ 0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL,
++ 0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL,
++ 0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL,
++ 0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL,
++ 0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL,
++ 0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL,
++ 0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL,
++ 0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL,
++ 0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL,
++ 0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL,
++ 0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL,
++ 0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL,
++ 0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL,
++ 0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL,
++ 0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL,
++ 0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL,
++ 0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL,
++ 0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL,
++ 0x958a324ceb064572ULL
++};
++
++static int __init siphash_test_init(void)
++{
++ u8 in[64] __aligned(SIPHASH_ALIGNMENT);
++ u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT);
++ u8 i;
++ int ret = 0;
++
++ for (i = 0; i < 64; ++i) {
++ in[i] = i;
++ in_unaligned[i + 1] = i;
++ if (siphash(in, i, &test_key_siphash) !=
++ test_vectors_siphash[i]) {
++ pr_info("siphash self-test aligned %u: FAIL\n", i + 1);
++ ret = -EINVAL;
++ }
++ if (siphash(in_unaligned + 1, i, &test_key_siphash) !=
++ test_vectors_siphash[i]) {
++ pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
++ ret = -EINVAL;
++ }
++ }
++ if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) !=
++ test_vectors_siphash[8]) {
++ pr_info("siphash self-test 1u64: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
++ &test_key_siphash) != test_vectors_siphash[16]) {
++ pr_info("siphash self-test 2u64: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
++ 0x1716151413121110ULL, &test_key_siphash) !=
++ test_vectors_siphash[24]) {
++ pr_info("siphash self-test 3u64: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
++ 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL,
++ &test_key_siphash) != test_vectors_siphash[32]) {
++ pr_info("siphash self-test 4u64: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_1u32(0x03020100U, &test_key_siphash) !=
++ test_vectors_siphash[4]) {
++ pr_info("siphash self-test 1u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) !=
++ test_vectors_siphash[8]) {
++ pr_info("siphash self-test 2u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_3u32(0x03020100U, 0x07060504U,
++ 0x0b0a0908U, &test_key_siphash) !=
++ test_vectors_siphash[12]) {
++ pr_info("siphash self-test 3u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (siphash_4u32(0x03020100U, 0x07060504U,
++ 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) !=
++ test_vectors_siphash[16]) {
++ pr_info("siphash self-test 4u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (!ret)
++ pr_info("self-tests: pass\n");
++ return ret;
++}
++
++static void __exit siphash_test_exit(void)
++{
++}
++
++module_init(siphash_test_init);
++module_exit(siphash_test_exit);
++
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
++MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:01:19 +0100
+Subject: siphash: implement HalfSipHash1-3 for hash tables
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230119.GO9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit 1ae2324f732c9c4e2fa4ebd885fa1001b70d52e1 upstream.
+
+HalfSipHash, or hsiphash, is a shortened version of SipHash, which
+generates 32-bit outputs using a weaker 64-bit key. It has *much* lower
+security margins, and shouldn't be used for anything too sensitive, but
+it could be used as a hashtable key function replacement, if the output
+is never exposed, and if the security requirement is not too high.
+
+The goal is to make this something that performance-critical jhash users
+would be willing to use.
+
+On 64-bit machines, HalfSipHash1-3 is slower than SipHash1-3, so we alias
+SipHash1-3 to HalfSipHash1-3 on those systems.
+
+64-bit x86_64:
+[ 0.509409] test_siphash: SipHash2-4 cycles: 4049181
+[ 0.510650] test_siphash: SipHash1-3 cycles: 2512884
+[ 0.512205] test_siphash: HalfSipHash1-3 cycles: 3429920
+[ 0.512904] test_siphash: JenkinsHash cycles: 978267
+So, we map hsiphash() -> SipHash1-3
+
+32-bit x86:
+[ 0.509868] test_siphash: SipHash2-4 cycles: 14812892
+[ 0.513601] test_siphash: SipHash1-3 cycles: 9510710
+[ 0.515263] test_siphash: HalfSipHash1-3 cycles: 3856157
+[ 0.515952] test_siphash: JenkinsHash cycles: 1148567
+So, we map hsiphash() -> HalfSipHash1-3
+
+hsiphash() is roughly 3 times slower than jhash(), but comes with a
+considerable security improvement.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.9 to avoid regression for WireGuard with only half
+ the siphash API present]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/siphash.txt | 75 ++++++++++
+ include/linux/siphash.h | 57 ++++++++
+ lib/siphash.c | 321 +++++++++++++++++++++++++++++++++++++++++++++-
+ lib/test_siphash.c | 98 +++++++++++++-
+ 4 files changed, 546 insertions(+), 5 deletions(-)
+
+--- a/Documentation/siphash.txt
++++ b/Documentation/siphash.txt
+@@ -98,3 +98,78 @@ u64 h = siphash(&combined, offsetofend(t
+
+ Read the SipHash paper if you're interested in learning more:
+ https://131002.net/siphash/siphash.pdf
++
++
++~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~
++
++HalfSipHash - SipHash's insecure younger cousin
++-----------------------------------------------
++Written by Jason A. Donenfeld <jason@zx2c4.com>
++
++On the off-chance that SipHash is not fast enough for your needs, you might be
++able to justify using HalfSipHash, a terrifying but potentially useful
++possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and,
++even scarier, uses an easily brute-forceable 64-bit key (with a 32-bit output)
++instead of SipHash's 128-bit key. However, this may appeal to some
++high-performance `jhash` users.
++
++Danger!
++
++Do not ever use HalfSipHash except as a hashtable key function, and only
++then when you can be absolutely certain that the outputs will never be
++transmitted out of the kernel. This is only remotely useful over `jhash` as a
++means of mitigating hashtable flooding denial of service attacks.
++
++1. Generating a key
++
++Keys should always be generated from a cryptographically secure source of
++random numbers, either using get_random_bytes or get_random_once:
++
++hsiphash_key_t key;
++get_random_bytes(&key, sizeof(key));
++
++If you're not deriving your key from here, you're doing it wrong.
++
++2. Using the functions
++
++There are two variants of the function, one that takes a list of integers, and
++one that takes a buffer:
++
++u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key);
++
++And:
++
++u32 hsiphash_1u32(u32, const hsiphash_key_t *key);
++u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key);
++u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key);
++u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key);
++
++If you pass the generic hsiphash function something of a constant length, it
++will constant fold at compile-time and automatically choose one of the
++optimized functions.
++
++3. Hashtable key function usage:
++
++struct some_hashtable {
++ DECLARE_HASHTABLE(hashtable, 8);
++ hsiphash_key_t key;
++};
++
++void init_hashtable(struct some_hashtable *table)
++{
++ get_random_bytes(&table->key, sizeof(table->key));
++}
++
++static inline struct hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
++{
++ return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
++}
++
++You may then iterate like usual over the returned hash bucket.
++
++4. Performance
++
++HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements,
++this will not be a problem, as the hashtable lookup isn't the bottleneck. And
++in general, this is probably a good sacrifice to make for the security and DoS
++resistance of HalfSipHash.
+--- a/include/linux/siphash.h
++++ b/include/linux/siphash.h
+@@ -5,7 +5,9 @@
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+- * This implementation is specifically for SipHash2-4.
++ * This implementation is specifically for SipHash2-4 for a secure PRF
++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
++ * hashtables.
+ */
+
+ #ifndef _LINUX_SIPHASH_H
+@@ -82,4 +84,57 @@ static inline u64 siphash(const void *da
+ return ___siphash_aligned(data, len, key);
+ }
+
++#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
++typedef struct {
++ unsigned long key[2];
++} hsiphash_key_t;
++
++u32 __hsiphash_aligned(const void *data, size_t len,
++ const hsiphash_key_t *key);
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++u32 __hsiphash_unaligned(const void *data, size_t len,
++ const hsiphash_key_t *key);
++#endif
++
++u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
++u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
++u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
++ const hsiphash_key_t *key);
++u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
++ const hsiphash_key_t *key);
++
++static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
++ const hsiphash_key_t *key)
++{
++ if (__builtin_constant_p(len) && len == 4)
++ return hsiphash_1u32(le32_to_cpu(data[0]), key);
++ if (__builtin_constant_p(len) && len == 8)
++ return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
++ key);
++ if (__builtin_constant_p(len) && len == 12)
++ return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
++ le32_to_cpu(data[2]), key);
++ if (__builtin_constant_p(len) && len == 16)
++ return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
++ le32_to_cpu(data[2]), le32_to_cpu(data[3]),
++ key);
++ return __hsiphash_aligned(data, len, key);
++}
++
++/**
++ * hsiphash - compute 32-bit hsiphash PRF value
++ * @data: buffer to hash
++ * @len: size of @data
++ * @key: the hsiphash key
++ */
++static inline u32 hsiphash(const void *data, size_t len,
++ const hsiphash_key_t *key)
++{
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++ if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
++ return __hsiphash_unaligned(data, len, key);
++#endif
++ return ___hsiphash_aligned(data, len, key);
++}
++
+ #endif /* _LINUX_SIPHASH_H */
+--- a/lib/siphash.c
++++ b/lib/siphash.c
+@@ -5,7 +5,9 @@
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+- * This implementation is specifically for SipHash2-4.
++ * This implementation is specifically for SipHash2-4 for a secure PRF
++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
++ * hashtables.
+ */
+
+ #include <linux/siphash.h>
+@@ -230,3 +232,320 @@ u64 siphash_3u32(const u32 first, const
+ POSTAMBLE
+ }
+ EXPORT_SYMBOL(siphash_3u32);
++
++#if BITS_PER_LONG == 64
++/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
++ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
++ */
++
++#define HSIPROUND SIPROUND
++#define HPREAMBLE(len) PREAMBLE(len)
++#define HPOSTAMBLE \
++ v3 ^= b; \
++ HSIPROUND; \
++ v0 ^= b; \
++ v2 ^= 0xff; \
++ HSIPROUND; \
++ HSIPROUND; \
++ HSIPROUND; \
++ return (v0 ^ v1) ^ (v2 ^ v3);
++
++u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u64));
++ const u8 left = len & (sizeof(u64) - 1);
++ u64 m;
++ HPREAMBLE(len)
++ for (; data != end; data += sizeof(u64)) {
++ m = le64_to_cpup(data);
++ v3 ^= m;
++ HSIPROUND;
++ v0 ^= m;
++ }
++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
++ if (left)
++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
++ bytemask_from_count(left)));
++#else
++ switch (left) {
++ case 7: b |= ((u64)end[6]) << 48;
++ case 6: b |= ((u64)end[5]) << 40;
++ case 5: b |= ((u64)end[4]) << 32;
++ case 4: b |= le32_to_cpup(data); break;
++ case 3: b |= ((u64)end[2]) << 16;
++ case 2: b |= le16_to_cpup(data); break;
++ case 1: b |= end[0];
++ }
++#endif
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(__hsiphash_aligned);
++
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++u32 __hsiphash_unaligned(const void *data, size_t len,
++ const hsiphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u64));
++ const u8 left = len & (sizeof(u64) - 1);
++ u64 m;
++ HPREAMBLE(len)
++ for (; data != end; data += sizeof(u64)) {
++ m = get_unaligned_le64(data);
++ v3 ^= m;
++ HSIPROUND;
++ v0 ^= m;
++ }
++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
++ if (left)
++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
++ bytemask_from_count(left)));
++#else
++ switch (left) {
++ case 7: b |= ((u64)end[6]) << 48;
++ case 6: b |= ((u64)end[5]) << 40;
++ case 5: b |= ((u64)end[4]) << 32;
++ case 4: b |= get_unaligned_le32(end); break;
++ case 3: b |= ((u64)end[2]) << 16;
++ case 2: b |= get_unaligned_le16(end); break;
++ case 1: b |= end[0];
++ }
++#endif
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(__hsiphash_unaligned);
++#endif
++
++/**
++ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
++ * @first: first u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
++{
++ HPREAMBLE(4)
++ b |= first;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_1u32);
++
++/**
++ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
++ * @first: first u32
++ * @second: second u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
++{
++ u64 combined = (u64)second << 32 | first;
++ HPREAMBLE(8)
++ v3 ^= combined;
++ HSIPROUND;
++ v0 ^= combined;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_2u32);
++
++/**
++ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
++ * @first: first u32
++ * @second: second u32
++ * @third: third u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
++ const hsiphash_key_t *key)
++{
++ u64 combined = (u64)second << 32 | first;
++ HPREAMBLE(12)
++ v3 ^= combined;
++ HSIPROUND;
++ v0 ^= combined;
++ b |= third;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_3u32);
++
++/**
++ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
++ * @first: first u32
++ * @second: second u32
++ * @third: third u32
++ * @forth: forth u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
++ const u32 forth, const hsiphash_key_t *key)
++{
++ u64 combined = (u64)second << 32 | first;
++ HPREAMBLE(16)
++ v3 ^= combined;
++ HSIPROUND;
++ v0 ^= combined;
++ combined = (u64)forth << 32 | third;
++ v3 ^= combined;
++ HSIPROUND;
++ v0 ^= combined;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_4u32);
++#else
++#define HSIPROUND \
++ do { \
++ v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
++ v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
++ v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
++ v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
++ } while (0)
++
++#define HPREAMBLE(len) \
++ u32 v0 = 0; \
++ u32 v1 = 0; \
++ u32 v2 = 0x6c796765U; \
++ u32 v3 = 0x74656462U; \
++ u32 b = ((u32)(len)) << 24; \
++ v3 ^= key->key[1]; \
++ v2 ^= key->key[0]; \
++ v1 ^= key->key[1]; \
++ v0 ^= key->key[0];
++
++#define HPOSTAMBLE \
++ v3 ^= b; \
++ HSIPROUND; \
++ v0 ^= b; \
++ v2 ^= 0xff; \
++ HSIPROUND; \
++ HSIPROUND; \
++ HSIPROUND; \
++ return v1 ^ v3;
++
++u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u32));
++ const u8 left = len & (sizeof(u32) - 1);
++ u32 m;
++ HPREAMBLE(len)
++ for (; data != end; data += sizeof(u32)) {
++ m = le32_to_cpup(data);
++ v3 ^= m;
++ HSIPROUND;
++ v0 ^= m;
++ }
++ switch (left) {
++ case 3: b |= ((u32)end[2]) << 16;
++ case 2: b |= le16_to_cpup(data); break;
++ case 1: b |= end[0];
++ }
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(__hsiphash_aligned);
++
++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
++u32 __hsiphash_unaligned(const void *data, size_t len,
++ const hsiphash_key_t *key)
++{
++ const u8 *end = data + len - (len % sizeof(u32));
++ const u8 left = len & (sizeof(u32) - 1);
++ u32 m;
++ HPREAMBLE(len)
++ for (; data != end; data += sizeof(u32)) {
++ m = get_unaligned_le32(data);
++ v3 ^= m;
++ HSIPROUND;
++ v0 ^= m;
++ }
++ switch (left) {
++ case 3: b |= ((u32)end[2]) << 16;
++ case 2: b |= get_unaligned_le16(end); break;
++ case 1: b |= end[0];
++ }
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(__hsiphash_unaligned);
++#endif
++
++/**
++ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
++ * @first: first u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
++{
++ HPREAMBLE(4)
++ v3 ^= first;
++ HSIPROUND;
++ v0 ^= first;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_1u32);
++
++/**
++ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
++ * @first: first u32
++ * @second: second u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
++{
++ HPREAMBLE(8)
++ v3 ^= first;
++ HSIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ HSIPROUND;
++ v0 ^= second;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_2u32);
++
++/**
++ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
++ * @first: first u32
++ * @second: second u32
++ * @third: third u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
++ const hsiphash_key_t *key)
++{
++ HPREAMBLE(12)
++ v3 ^= first;
++ HSIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ HSIPROUND;
++ v0 ^= second;
++ v3 ^= third;
++ HSIPROUND;
++ v0 ^= third;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_3u32);
++
++/**
++ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
++ * @first: first u32
++ * @second: second u32
++ * @third: third u32
++ * @forth: fourth u32
++ * @key: the hsiphash key
++ */
++u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
++ const u32 forth, const hsiphash_key_t *key)
++{
++ HPREAMBLE(16)
++ v3 ^= first;
++ HSIPROUND;
++ v0 ^= first;
++ v3 ^= second;
++ HSIPROUND;
++ v0 ^= second;
++ v3 ^= third;
++ HSIPROUND;
++ v0 ^= third;
++ v3 ^= forth;
++ HSIPROUND;
++ v0 ^= forth;
++ HPOSTAMBLE
++}
++EXPORT_SYMBOL(hsiphash_4u32);
++#endif
+--- a/lib/test_siphash.c
++++ b/lib/test_siphash.c
+@@ -7,7 +7,9 @@
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+- * This implementation is specifically for SipHash2-4.
++ * This implementation is specifically for SipHash2-4 for a secure PRF
++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
++ * hashtables.
+ */
+
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+@@ -18,8 +20,8 @@
+ #include <linux/errno.h>
+ #include <linux/module.h>
+
+-/* Test vectors taken from official reference source available at:
+- * https://131002.net/siphash/siphash24.c
++/* Test vectors taken from reference source available at:
++ * https://github.com/veorq/SipHash
+ */
+
+ static const siphash_key_t test_key_siphash =
+@@ -50,6 +52,64 @@ static const u64 test_vectors_siphash[64
+ 0x958a324ceb064572ULL
+ };
+
++#if BITS_PER_LONG == 64
++static const hsiphash_key_t test_key_hsiphash =
++ {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
++
++static const u32 test_vectors_hsiphash[64] = {
++ 0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU,
++ 0xe7ddf7fbU, 0x88d38328U, 0x49533b67U,
++ 0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU,
++ 0x6c063de4U, 0x92ff097fU, 0xf94dc352U,
++ 0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U,
++ 0x2a519956U, 0x7d908b66U, 0x63dbd80cU,
++ 0xb473e63eU, 0x8d297d1cU, 0xa6cce040U,
++ 0x2b45f844U, 0xa320872eU, 0xdae6c123U,
++ 0x67349c8cU, 0x705b0979U, 0xca9913a5U,
++ 0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U,
++ 0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU,
++ 0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U,
++ 0xada26206U, 0xa3c33057U, 0xae3a36a1U,
++ 0x7b108392U, 0x99e41531U, 0x3f1ad944U,
++ 0xc8138825U, 0xc28949a6U, 0xfaf8876bU,
++ 0x9f042196U, 0x68b1d623U, 0x8b5114fdU,
++ 0xdf074c46U, 0x12cc86b3U, 0x0a52098fU,
++ 0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U,
++ 0x73f0bce6U, 0x70a7e980U, 0x243c6d75U,
++ 0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U,
++ 0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U,
++ 0xb7bbb3a8U
++};
++#else
++static const hsiphash_key_t test_key_hsiphash =
++ {{ 0x03020100U, 0x07060504U }};
++
++static const u32 test_vectors_hsiphash[64] = {
++ 0x5814c896U, 0xe7e864caU, 0xbc4b0e30U,
++ 0x01539939U, 0x7e059ea6U, 0x88e3d89bU,
++ 0xa0080b65U, 0x9d38d9d6U, 0x577999b1U,
++ 0xc839caedU, 0xe4fa32cfU, 0x959246eeU,
++ 0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU,
++ 0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU,
++ 0x06712339U, 0x522aca67U, 0x911bb605U,
++ 0x90a65f0eU, 0xf826ef7bU, 0x62512debU,
++ 0x57150ad7U, 0x5d473507U, 0x1ec47442U,
++ 0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U,
++ 0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU,
++ 0xe0f6c934U, 0xb0652033U, 0x9b9851ccU,
++ 0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU,
++ 0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU,
++ 0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U,
++ 0x65671619U, 0x9f5fff91U, 0xd89c5267U,
++ 0x007783ebU, 0x95766243U, 0xab639262U,
++ 0x9c7e1390U, 0xc368dda6U, 0x38ddc455U,
++ 0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU,
++ 0x2ee80657U, 0x33dbb66aU, 0xae3f0577U,
++ 0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U,
++ 0x87178304U
++};
++#endif
++
+ static int __init siphash_test_init(void)
+ {
+ u8 in[64] __aligned(SIPHASH_ALIGNMENT);
+@@ -70,6 +130,16 @@ static int __init siphash_test_init(void
+ pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
+ ret = -EINVAL;
+ }
++ if (hsiphash(in, i, &test_key_hsiphash) !=
++ test_vectors_hsiphash[i]) {
++ pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1);
++ ret = -EINVAL;
++ }
++ if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) !=
++ test_vectors_hsiphash[i]) {
++ pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1);
++ ret = -EINVAL;
++ }
+ }
+ if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) !=
+ test_vectors_siphash[8]) {
+@@ -115,6 +185,28 @@ static int __init siphash_test_init(void
+ pr_info("siphash self-test 4u32: FAIL\n");
+ ret = -EINVAL;
+ }
++ if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) !=
++ test_vectors_hsiphash[4]) {
++ pr_info("hsiphash self-test 1u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) !=
++ test_vectors_hsiphash[8]) {
++ pr_info("hsiphash self-test 2u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (hsiphash_3u32(0x03020100U, 0x07060504U,
++ 0x0b0a0908U, &test_key_hsiphash) !=
++ test_vectors_hsiphash[12]) {
++ pr_info("hsiphash self-test 3u32: FAIL\n");
++ ret = -EINVAL;
++ }
++ if (hsiphash_4u32(0x03020100U, 0x07060504U,
++ 0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) !=
++ test_vectors_hsiphash[16]) {
++ pr_info("hsiphash self-test 4u32: FAIL\n");
++ ret = -EINVAL;
++ }
+ if (!ret)
+ pr_info("self-tests: pass\n");
+ return ret;
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:44 +0100
+Subject: vhost: introduce vhost_exceeds_weight()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230044.GK9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e82b9b0727ff6d665fff2d326162b460dded554d upstream.
+
+We used to have vhost_exceeds_weight() for vhost-net to:
+
+- prevent vhost kthread from hogging the cpu
+- balance the time spent between TX and RX
+
+This function could be useful for vsock and scsi as well. So move it
+to vhost.c. A device must specify a weight which counts the number of
+requests, and it can also specify a byte_weight which counts the
+number of bytes that have been processed.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+[bwh: Backported to 4.9:
+ - In vhost_net, both Tx modes are handled in one loop in handle_tx()
+ - Adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c | 18 +++++-------------
+ drivers/vhost/scsi.c | 9 ++++++++-
+ drivers/vhost/vhost.c | 20 +++++++++++++++++++-
+ drivers/vhost/vhost.h | 6 +++++-
+ drivers/vhost/vsock.c | 12 +++++++++++-
+ 5 files changed, 48 insertions(+), 17 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -357,12 +357,6 @@ static int vhost_net_tx_get_vq_desc(stru
+ return r;
+ }
+
+-static bool vhost_exceeds_weight(int pkts, int total_len)
+-{
+- return total_len >= VHOST_NET_WEIGHT ||
+- pkts >= VHOST_NET_PKT_WEIGHT;
+-}
+-
+ /* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -487,10 +481,9 @@ static void handle_tx(struct vhost_net *
+ vhost_zerocopy_signal_used(net, vq);
+ total_len += len;
+ vhost_net_tx_packet(net);
+- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+- vhost_poll_queue(&vq->poll);
++ if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
++ total_len)))
+ break;
+- }
+ }
+ out:
+ mutex_unlock(&vq->mutex);
+@@ -768,10 +761,8 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+- vhost_poll_queue(&vq->poll);
++ if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+ goto out;
+- }
+ }
+ vhost_net_enable_vq(net, vq);
+ out:
+@@ -842,7 +833,8 @@ static int vhost_net_open(struct inode *
+ n->vqs[i].vhost_hlen = 0;
+ n->vqs[i].sock_hlen = 0;
+ }
+- vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
++ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
++ VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
+
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -58,6 +58,12 @@
+ #define VHOST_SCSI_PREALLOC_UPAGES 2048
+ #define VHOST_SCSI_PREALLOC_PROT_SGLS 512
+
++/* Max number of requests before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * request.
++ */
++#define VHOST_SCSI_WEIGHT 256
++
+ struct vhost_scsi_inflight {
+ /* Wait for the flush operation to finish */
+ struct completion comp;
+@@ -1433,7 +1439,8 @@ static int vhost_scsi_open(struct inode
+ vqs[i] = &vs->vqs[i].vq;
+ vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+ }
+- vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ);
++ vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ,
++ VHOST_SCSI_WEIGHT, 0);
+
+ vhost_scsi_init_inflight(vs, NULL);
+
+--- a/drivers/vhost/vhost.c
++++ b/drivers/vhost/vhost.c
+@@ -393,8 +393,24 @@ static void vhost_dev_free_iovecs(struct
+ vhost_vq_free_iovecs(dev->vqs[i]);
+ }
+
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
++ int pkts, int total_len)
++{
++ struct vhost_dev *dev = vq->dev;
++
++ if ((dev->byte_weight && total_len >= dev->byte_weight) ||
++ pkts >= dev->weight) {
++ vhost_poll_queue(&vq->poll);
++ return true;
++ }
++
++ return false;
++}
++EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
++
+ void vhost_dev_init(struct vhost_dev *dev,
+- struct vhost_virtqueue **vqs, int nvqs)
++ struct vhost_virtqueue **vqs, int nvqs,
++ int weight, int byte_weight)
+ {
+ struct vhost_virtqueue *vq;
+ int i;
+@@ -408,6 +424,8 @@ void vhost_dev_init(struct vhost_dev *de
+ dev->iotlb = NULL;
+ dev->mm = NULL;
+ dev->worker = NULL;
++ dev->weight = weight;
++ dev->byte_weight = byte_weight;
+ init_llist_head(&dev->work_list);
+ init_waitqueue_head(&dev->wait);
+ INIT_LIST_HEAD(&dev->read_list);
+--- a/drivers/vhost/vhost.h
++++ b/drivers/vhost/vhost.h
+@@ -164,9 +164,13 @@ struct vhost_dev {
+ struct list_head read_list;
+ struct list_head pending_list;
+ wait_queue_head_t wait;
++ int weight;
++ int byte_weight;
+ };
+
+-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
++void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
++ int nvqs, int weight, int byte_weight);
+ long vhost_dev_set_owner(struct vhost_dev *dev);
+ bool vhost_dev_has_owner(struct vhost_dev *dev);
+ long vhost_dev_check_owner(struct vhost_dev *);
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -21,6 +21,14 @@
+ #include "vhost.h"
+
+ #define VHOST_VSOCK_DEFAULT_HOST_CID 2
++/* Max number of bytes transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others. */
++#define VHOST_VSOCK_WEIGHT 0x80000
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving others with
++ * small pkts.
++ */
++#define VHOST_VSOCK_PKT_WEIGHT 256
+
+ enum {
+ VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+@@ -529,7 +537,9 @@ static int vhost_vsock_dev_open(struct i
+ vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
+ vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
+
+- vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs));
++ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
++ VHOST_VSOCK_PKT_WEIGHT,
++ VHOST_VSOCK_WEIGHT);
+
+ file->private_data = vsock;
+ spin_lock_init(&vsock->send_pkt_list_lock);
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:19 +0100
+Subject: vhost-net: set packet weight of tx polling to 2 * vq size
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230019.GH9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: haibinzhang(张海斌) <haibinzhang@tencent.com>
+
+commit a2ac99905f1ea8b15997a6ec39af69aa28a3653b upstream.
+
+handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
+polling udp packets with small length(e.g. 1byte udp payload), because setting
+VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
+
+Ping-Latencies shown below were tested between two Virtual Machines using
+netperf (UDP_STREAM, len=1), and then another machine pinged the client:
+
+vq size=256
+Packet-Weight Ping-Latencies(millisecond)
+ min avg max
+Origin 3.319 18.489 57.303
+64 1.643 2.021 2.552
+128 1.825 2.600 3.224
+256 1.997 2.710 4.295
+512 1.860 3.171 4.631
+1024 2.002 4.173 9.056
+2048 2.257 5.650 9.688
+4096 2.093 8.508 15.943
+
+vq size=512
+Packet-Weight Ping-Latencies(millisecond)
+ min avg max
+Origin 6.537 29.177 66.245
+64 2.798 3.614 4.403
+128 2.861 3.820 4.775
+256 3.008 4.018 4.807
+512 3.254 4.523 5.824
+1024 3.079 5.335 7.747
+2048 3.944 8.201 12.762
+4096 4.158 11.057 19.985
+
+Seems pretty consistent, a small dip at 2 VQ sizes.
+Ring size is a hint from device about a burst size it can tolerate. Based on
+benchmarks, set the weight to 2 * vq size.
+
+To evaluate this change, further tests were done using netperf (RR, TX) between
+two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was
+tweaked through qemu. The results shown below do not show obvious changes.
+
+vq size=256 TCP_RR vq size=512 TCP_RR
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2%
+ 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0%
+ 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1%
+ 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3%
+ 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1%
+ 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2%
+ 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2%
+ 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2%
+ 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1%
+
+vq size=256 UDP_RR vq size=512 UDP_RR
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2%
+ 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2%
+ 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0%
+ 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1%
+ 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0%
+ 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1%
+ 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0%
+ 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4%
+ 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1%
+
+vq size=256 TCP_STREAM vq size=512 TCP_STREAM
+size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
+ 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0%
+ 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4%
+ 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2%
+ 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1%
+ 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0%
+ 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0%
+ 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1%
+ 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0%
+ 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1%
+1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1%
+1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0%
+1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0%
+2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0%
+2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1%
+2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1%
+4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0%
+4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0%
+4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1%
+
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
+Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
+Signed-off-by: Lidong Chen <lidongchen@tencent.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -39,6 +39,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+ * Using this limit prevents one virtqueue from starving others. */
+ #define VHOST_NET_WEIGHT 0x80000
+
++/* Max number of packets transferred before requeueing the job.
++ * Using this limit prevents one virtqueue from starving rx. */
++#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+ #define VHOST_GOODCOPY_LEN 256
+@@ -372,6 +376,7 @@ static void handle_tx(struct vhost_net *
+ struct socket *sock;
+ struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ bool zcopy, zcopy_used;
++ int sent_pkts = 0;
+
+ mutex_lock(&vq->mutex);
+ sock = vq->private_data;
+@@ -474,7 +479,8 @@ static void handle_tx(struct vhost_net *
+ vhost_zerocopy_signal_used(net, vq);
+ total_len += len;
+ vhost_net_tx_packet(net);
+- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:01:01 +0100
+Subject: vhost: scsi: add weight support
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230101.GM9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 upstream.
+
+This patch checks the weight and exits the loop if we exceed the
+weight. This is useful for preventing the scsi kthread from hogging the
+cpu, which is guest triggerable.
+
+This addresses CVE-2019-3900.
+
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Stefan Hajnoczi <stefanha@redhat.com>
+Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric driver")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+[bwh: Backported to 4.9:
+ - Drop changes in vhost_scsi_ctl_handle_vq()
+ - Adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/scsi.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/vhost/scsi.c
++++ b/drivers/vhost/scsi.c
+@@ -851,7 +851,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+ u64 tag;
+ u32 exp_data_len, data_direction;
+ unsigned out, in;
+- int head, ret, prot_bytes;
++ int head, ret, prot_bytes, c = 0;
+ size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
+ size_t out_size, in_size;
+ u16 lun;
+@@ -870,7 +870,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+
+ vhost_disable_notify(&vs->dev, vq);
+
+- for (;;) {
++ do {
+ head = vhost_get_vq_desc(vq, vq->iov,
+ ARRAY_SIZE(vq->iov), &out, &in,
+ NULL, NULL);
+@@ -1086,7 +1086,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *
+ */
+ INIT_WORK(&cmd->work, vhost_scsi_submission_work);
+ queue_work(vhost_scsi_workqueue, &cmd->work);
+- }
++ } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+ out:
+ mutex_unlock(&vq->mutex);
+ }
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:53 +0100
+Subject: vhost_net: fix possible infinite loop
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230053.GL9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit e2412c07f8f3040593dfb88207865a3cd58680c0 upstream.
+
+When the rx buffer is too small for a packet, we will discard the vq
+descriptor and retry it for the next packet:
+
+while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr))) {
+...
+ /* On overrun, truncate and discard */
+ if (unlikely(headcount > UIO_MAXIOV)) {
+ iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+ err = sock->ops->recvmsg(sock, &msg,
+ 1, MSG_DONTWAIT | MSG_TRUNC);
+ pr_debug("Discarded rx packet: len %zd\n", sock_len);
+ continue;
+ }
+...
+}
+
+This makes it possible to trigger an infinite while..continue loop
+through the co-operation of two VMs like:
+
+1) Malicious VM1 allocates a 1 byte rx buffer and tries to slow down the
+ vhost process as much as possible e.g using indirect descriptors or
+ other.
+2) Malicious VM2 generates packets to VM1 as fast as possible
+
+Fix this by checking against the weight at the end of the RX and TX
+loops. This also eliminates other similar cases when:
+
+- userspace is consuming the packets in the meanwhile
+- theoretical TOCTOU attack if guest moving avail index back and forth
+ to hit the continue after vhost find guest just add new buffers
+
+This addresses CVE-2019-3900.
+
+Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short")
+Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+[bwh: Backported to 4.9:
+ - Both Tx modes are handled in one loop in handle_tx()
+ - Adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c | 22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -393,7 +393,7 @@ static void handle_tx(struct vhost_net *
+ hdr_size = nvq->vhost_hlen;
+ zcopy = nvq->ubufs;
+
+- for (;;) {
++ do {
+ /* Release DMAs done buffers first */
+ if (zcopy)
+ vhost_zerocopy_signal_used(net, vq);
+@@ -481,10 +481,7 @@ static void handle_tx(struct vhost_net *
+ vhost_zerocopy_signal_used(net, vq);
+ total_len += len;
+ vhost_net_tx_packet(net);
+- if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
+- total_len)))
+- break;
+- }
++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
+ out:
+ mutex_unlock(&vq->mutex);
+ }
+@@ -682,7 +679,10 @@ static void handle_rx(struct vhost_net *
+ vq->log : NULL;
+ mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+
+- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
++ do {
++ sock_len = vhost_net_rx_peek_head_len(net, sock->sk);
++ if (!sock_len)
++ break;
+ sock_len += sock_hlen;
+ vhost_len = sock_len + vhost_hlen;
+ headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+@@ -761,10 +761,10 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
+- goto out;
+- }
+- vhost_net_enable_vq(net, vq);
++ } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
++
++ if (!sock_len)
++ vhost_net_enable_vq(net, vq);
+ out:
+ mutex_unlock(&vq->mutex);
+ }
+@@ -834,7 +834,7 @@ static int vhost_net_open(struct inode *
+ n->vqs[i].sock_hlen = 0;
+ }
+ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
+- VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
++ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:36 +0100
+Subject: vhost_net: introduce vhost_exceeds_weight()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230036.GJ9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Jason Wang <jasowang@redhat.com>
+
+commit 272f35cba53d088085e5952fd81d7a133ab90789 upstream.
+
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -357,6 +357,12 @@ static int vhost_net_tx_get_vq_desc(stru
+ return r;
+ }
+
++static bool vhost_exceeds_weight(int pkts, int total_len)
++{
++ return total_len >= VHOST_NET_WEIGHT ||
++ pkts >= VHOST_NET_PKT_WEIGHT;
++}
++
+ /* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+ static void handle_tx(struct vhost_net *net)
+@@ -481,8 +487,7 @@ static void handle_tx(struct vhost_net *
+ vhost_zerocopy_signal_used(net, vq);
+ total_len += len;
+ vhost_net_tx_packet(net);
+- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
++ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+@@ -763,8 +768,7 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
++ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
+ vhost_poll_queue(&vq->poll);
+ goto out;
+ }
--- /dev/null
+From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST
+From: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Date: Sat, 17 Aug 2019 00:00:28 +0100
+Subject: vhost_net: use packet weight for rx handler, too
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, Sasha Levin <sashal@kernel.org>
+Cc: stable <stable@vger.kernel.org>
+Message-ID: <20190816230028.GI9843@xylophone.i.decadent.org.uk>
+Content-Disposition: inline
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 upstream.
+
+Similar to commit a2ac99905f1e ("vhost-net: set packet weight of
+tx polling to 2 * vq size"), we need a packet-based limit for
+handle_rx, too - otherwise, under rx flood with small packets,
+tx can be delayed for a very long time, even without busypolling.
+
+The pkt limit applied to handle_rx must be the same applied by
+handle_tx, or we will get unfair scheduling between rx and tx.
+Tying such limit to the queue length makes it less effective for
+large queue length values and can introduce large process
+scheduler latencies, so a constant value is used - like
+the existing bytes limit.
+
+The selected limit has been validated with PVP[1] performance
+test with different queue sizes:
+
+queue size 256 512 1024
+
+baseline 366 354 362
+weight 128 715 723 670
+weight 256 740 745 733
+weight 512 600 460 583
+weight 1024 423 427 418
+
+A packet weight of 256 gives peak performance under all the
+tested scenarios.
+
+No measurable regression in unidirectional performance tests has
+been detected.
+
+[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/
+
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -40,8 +40,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "
+ #define VHOST_NET_WEIGHT 0x80000
+
+ /* Max number of packets transferred before requeueing the job.
+- * Using this limit prevents one virtqueue from starving rx. */
+-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
++ * Using this limit prevents one virtqueue from starving others with small
++ * pkts.
++ */
++#define VHOST_NET_PKT_WEIGHT 256
+
+ /* MAX number of TX used buffers for outstanding zerocopy */
+ #define VHOST_MAX_PEND 128
+@@ -480,7 +482,7 @@ static void handle_tx(struct vhost_net *
+ total_len += len;
+ vhost_net_tx_packet(net);
+ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+@@ -662,6 +664,7 @@ static void handle_rx(struct vhost_net *
+ struct socket *sock;
+ struct iov_iter fixup;
+ __virtio16 num_buffers;
++ int recv_pkts = 0;
+
+ mutex_lock_nested(&vq->mutex, 0);
+ sock = vq->private_data;
+@@ -760,7 +763,8 @@ static void handle_rx(struct vhost_net *
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
+ total_len += vhost_len;
+- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
++ if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
++ unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ goto out;
+ }