From: Greg Kroah-Hartman Date: Sat, 17 Aug 2019 16:41:17 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.19.68~59 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7ca3129aa386900c90defd9a8c67bc5509f614b6;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch bpf-restrict-access-to-core-bpf-sysctls.patch inet-switch-ip-id-generator-to-siphash.patch netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch siphash-add-cryptographically-secure-prf.patch siphash-implement-halfsiphash1-3-for-hash-tables.patch vhost-introduce-vhost_exceeds_weight.patch vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch vhost-scsi-add-weight-support.patch vhost_net-fix-possible-infinite-loop.patch vhost_net-introduce-vhost_exceeds_weight.patch vhost_net-use-packet-weight-for-rx-handler-too.patch --- diff --git a/queue-4.9/bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch b/queue-4.9/bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch new file mode 100644 index 00000000000..285f3aeefd0 --- /dev/null +++ b/queue-4.9/bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch @@ -0,0 +1,197 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:08 +0100 +Subject: bpf: add bpf_jit_limit knob to restrict unpriv allocations +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230008.GG9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Daniel Borkmann + +commit ede95a63b5e84ddeea6b0c473b36ab8bfd8c6ce3 upstream. + +Rick reported that the BPF JIT could potentially fill the entire module +space with BPF programs from unprivileged users which would prevent later +attempts to load normal kernel modules or privileged BPF programs, for +example. If JIT was enabled but unsuccessful to generate the image, then +before commit 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") +we would always fall back to the BPF interpreter. Nowadays in the case +where the CONFIG_BPF_JIT_ALWAYS_ON could be set, then the load will abort +with a failure since the BPF interpreter was compiled out. + +Add a global limit and enforce it for unprivileged users such that in case +of BPF interpreter compiled out we fail once the limit has been reached +or we fall back to BPF interpreter earlier w/o using module mem if latter +was compiled in. In a next step, fair share among unprivileged users can +be resolved in particular for the case where we would fail hard once limit +is reached. 
+ +Fixes: 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") +Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64") +Co-Developed-by: Rick Edgecombe +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Cc: Eric Dumazet +Cc: Jann Horn +Cc: Kees Cook +Cc: LKML +Signed-off-by: Alexei Starovoitov +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/sysctl/net.txt | 8 +++++++ + include/linux/filter.h | 1 + kernel/bpf/core.c | 49 ++++++++++++++++++++++++++++++++++++++++--- + net/core/sysctl_net_core.c | 10 +++++++- + 4 files changed, 63 insertions(+), 5 deletions(-) + +--- a/Documentation/sysctl/net.txt ++++ b/Documentation/sysctl/net.txt +@@ -54,6 +54,14 @@ Values : + 1 - enable JIT hardening for unprivileged users only + 2 - enable JIT hardening for all users + ++bpf_jit_limit ++------------- ++ ++This enforces a global limit for memory allocations to the BPF JIT ++compiler in order to reject unprivileged JIT requests once it has ++been surpassed. bpf_jit_limit contains the value of the global limit ++in bytes. ++ + dev_weight + -------------- + +--- a/include/linux/filter.h ++++ b/include/linux/filter.h +@@ -599,6 +599,7 @@ void bpf_warn_invalid_xdp_action(u32 act + #ifdef CONFIG_BPF_JIT + extern int bpf_jit_enable; + extern int bpf_jit_harden; ++extern int bpf_jit_limit; + + typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -208,9 +208,43 @@ struct bpf_prog *bpf_patch_insn_single(s + } + + #ifdef CONFIG_BPF_JIT ++# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) ++ + /* All BPF JIT sysctl knobs here. */ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); + int bpf_jit_harden __read_mostly; ++int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; ++ ++static atomic_long_t bpf_jit_current; ++ ++#if defined(MODULES_VADDR) ++static int __init bpf_jit_charge_init(void) ++{ ++ /* Only used as heuristic here to derive limit. */ ++ bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, ++ PAGE_SIZE), INT_MAX); ++ return 0; ++} ++pure_initcall(bpf_jit_charge_init); ++#endif ++ ++static int bpf_jit_charge_modmem(u32 pages) ++{ ++ if (atomic_long_add_return(pages, &bpf_jit_current) > ++ (bpf_jit_limit >> PAGE_SHIFT)) { ++ if (!capable(CAP_SYS_ADMIN)) { ++ atomic_long_sub(pages, &bpf_jit_current); ++ return -EPERM; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bpf_jit_uncharge_modmem(u32 pages) ++{ ++ atomic_long_sub(pages, &bpf_jit_current); ++} + + struct bpf_binary_header * + bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, +@@ -218,21 +252,27 @@ bpf_jit_binary_alloc(unsigned int progle + bpf_jit_fill_hole_t bpf_fill_ill_insns) + { + struct bpf_binary_header *hdr; +- unsigned int size, hole, start; ++ u32 size, hole, start, pages; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); ++ pages = size / PAGE_SIZE; ++ ++ if (bpf_jit_charge_modmem(pages)) ++ return NULL; + hdr = module_alloc(size); +- if (hdr == NULL) ++ if (!hdr) { ++ bpf_jit_uncharge_modmem(pages); + return NULL; ++ } + + /* Fill space with illegal/arch-dep instructions. 
*/ + bpf_fill_ill_insns(hdr, size); + +- hdr->pages = size / PAGE_SIZE; ++ hdr->pages = pages; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (get_random_int() % hole) & ~(alignment - 1); +@@ -245,7 +285,10 @@ bpf_jit_binary_alloc(unsigned int progle + + void bpf_jit_binary_free(struct bpf_binary_header *hdr) + { ++ u32 pages = hdr->pages; ++ + module_memfree(hdr); ++ bpf_jit_uncharge_modmem(pages); + } + + static int bpf_jit_blind_insn(const struct bpf_insn *from, +--- a/net/core/sysctl_net_core.c ++++ b/net/core/sysctl_net_core.c +@@ -253,7 +253,6 @@ static int proc_dointvec_minmax_bpf_enab + return ret; + } + +-# ifdef CONFIG_HAVE_EBPF_JIT + static int + proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, +@@ -264,7 +263,6 @@ proc_dointvec_minmax_bpf_restricted(stru + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } +-# endif + #endif + + static struct ctl_table net_core_table[] = { +@@ -348,6 +346,14 @@ static struct ctl_table net_core_table[] + .extra2 = &two, + }, + # endif ++ { ++ .procname = "bpf_jit_limit", ++ .data = &bpf_jit_limit, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = proc_dointvec_minmax_bpf_restricted, ++ .extra1 = &one, ++ }, + #endif + { + .procname = "netdev_tstamp_prequeue", diff --git a/queue-4.9/bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch b/queue-4.9/bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch new file mode 100644 index 00000000000..793b8fa83c1 --- /dev/null +++ b/queue-4.9/bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch @@ -0,0 +1,238 @@ +From foo@baz Sat 17 Aug 2019 06:38:21 PM CEST +From: Ben Hutchings +Date: Fri, 16 Aug 2019 23:59:20 +0100 +Subject: bpf: get rid of pure_initcall dependency to enable jits +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816225920.GE9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Daniel Borkmann + +commit fa9dd599b4dae841924b022768354cfde9affecb upstream. + +Having a pure_initcall() callback just to permanently enable BPF +JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave +a small race window in future where JIT is still disabled on boot. +Since we know about the setting at compilation time anyway, just +initialize it properly there. Also consolidate all the individual +bpf_jit_enable variables into a single one and move them under one +location. Moreover, don't allow for setting unspecified garbage +values on them. 
+ +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: Alexei Starovoitov +[bwh: Backported to 4.9 as dependency of commit 2e4a30983b0f + "bpf: restrict access to core bpf sysctls": + - Drop change in arch/mips/net/ebpf_jit.c + - Drop change to bpf_jit_kallsyms + - Adjust filenames, context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/net/bpf_jit_32.c | 2 -- + arch/arm64/net/bpf_jit_comp.c | 2 -- + arch/mips/net/bpf_jit.c | 2 -- + arch/powerpc/net/bpf_jit_comp.c | 2 -- + arch/powerpc/net/bpf_jit_comp64.c | 2 -- + arch/s390/net/bpf_jit_comp.c | 2 -- + arch/sparc/net/bpf_jit_comp.c | 2 -- + arch/x86/net/bpf_jit_comp.c | 2 -- + kernel/bpf/core.c | 15 +++++++++++---- + net/core/sysctl_net_core.c | 14 +++++++++----- + net/socket.c | 9 --------- + 11 files changed, 20 insertions(+), 34 deletions(-) + +--- a/arch/arm/net/bpf_jit_32.c ++++ b/arch/arm/net/bpf_jit_32.c +@@ -72,8 +72,6 @@ struct jit_ctx { + #endif + }; + +-int bpf_jit_enable __read_mostly; +- + static inline int call_neg_helper(struct sk_buff *skb, int offset, void *ret, + unsigned int size) + { +--- a/arch/arm64/net/bpf_jit_comp.c ++++ b/arch/arm64/net/bpf_jit_comp.c +@@ -30,8 +30,6 @@ + + #include "bpf_jit.h" + +-int bpf_jit_enable __read_mostly; +- + #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) + #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) + #define TCALL_CNT (MAX_BPF_JIT_REG + 2) +--- a/arch/mips/net/bpf_jit.c ++++ b/arch/mips/net/bpf_jit.c +@@ -1194,8 +1194,6 @@ jmp_cmp: + return 0; + } + +-int bpf_jit_enable __read_mostly; +- + void bpf_jit_compile(struct bpf_prog *fp) + { + struct jit_ctx ctx; +--- a/arch/powerpc/net/bpf_jit_comp.c ++++ b/arch/powerpc/net/bpf_jit_comp.c +@@ -18,8 +18,6 @@ + + #include "bpf_jit32.h" + +-int bpf_jit_enable __read_mostly; +- + static inline void bpf_flush_icache(void *start, void *end) + { + smp_wmb(); +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -21,8 +21,6 @@ + + #include "bpf_jit64.h" + +-int bpf_jit_enable __read_mostly; +- + static void bpf_jit_fill_ill_insns(void *area, unsigned int size) + { + int *p = area; +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -28,8 +28,6 @@ + #include + #include "bpf_jit.h" + +-int bpf_jit_enable __read_mostly; +- + struct bpf_jit { + u32 seen; /* Flags to remember seen eBPF instructions */ + u32 seen_reg[16]; /* Array to remember which registers are used */ +--- a/arch/sparc/net/bpf_jit_comp.c ++++ b/arch/sparc/net/bpf_jit_comp.c +@@ -10,8 +10,6 @@ + + #include "bpf_jit.h" + +-int bpf_jit_enable __read_mostly; +- + static inline bool is_simm13(unsigned int value) + { + return value + 0x1000 < 0x2000; +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -15,8 +15,6 @@ + #include + #include + +-int bpf_jit_enable __read_mostly; +- + /* + * assembly code in arch/x86/net/bpf_jit.S + */ +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -208,6 +208,10 @@ struct bpf_prog *bpf_patch_insn_single(s + } + + #ifdef CONFIG_BPF_JIT ++/* All BPF JIT sysctl knobs here. 
*/ ++int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); ++int bpf_jit_harden __read_mostly; ++ + struct bpf_binary_header * + bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, +@@ -244,8 +248,6 @@ void bpf_jit_binary_free(struct bpf_bina + module_memfree(hdr); + } + +-int bpf_jit_harden __read_mostly; +- + static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +@@ -925,8 +927,13 @@ load_byte: + STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ + + #else +-static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn) ++static unsigned int __bpf_prog_ret0_warn(void *ctx, ++ const struct bpf_insn *insn) + { ++ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON ++ * is not working properly, so warn about it! ++ */ ++ WARN_ON_ONCE(1); + return 0; + } + #endif +@@ -981,7 +988,7 @@ struct bpf_prog *bpf_prog_select_runtime + #ifndef CONFIG_BPF_JIT_ALWAYS_ON + fp->bpf_func = (void *) __bpf_prog_run; + #else +- fp->bpf_func = (void *) __bpf_prog_ret0; ++ fp->bpf_func = (void *) __bpf_prog_ret0_warn; + #endif + + /* eBPF JITs can rewrite the program in case constant +--- a/net/core/sysctl_net_core.c ++++ b/net/core/sysctl_net_core.c +@@ -24,6 +24,7 @@ + + static int zero = 0; + static int one = 1; ++static int two __maybe_unused = 2; + static int min_sndbuf = SOCK_MIN_SNDBUF; + static int min_rcvbuf = SOCK_MIN_RCVBUF; + static int max_skb_frags = MAX_SKB_FRAGS; +@@ -292,13 +293,14 @@ static struct ctl_table net_core_table[] + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, +-#ifndef CONFIG_BPF_JIT_ALWAYS_ON +- .proc_handler = proc_dointvec +-#else + .proc_handler = proc_dointvec_minmax, ++# ifdef CONFIG_BPF_JIT_ALWAYS_ON + .extra1 = &one, + .extra2 = &one, +-#endif ++# else ++ .extra1 = &zero, ++ .extra2 = &two, ++# endif + }, + # ifdef CONFIG_HAVE_EBPF_JIT + { +@@ -306,7 +308,9 @@ static struct ctl_table net_core_table[] + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, +- .proc_handler = proc_dointvec, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, + }, + # endif + #endif +--- a/net/socket.c ++++ b/net/socket.c +@@ -2550,15 +2550,6 @@ out_fs: + + core_initcall(sock_init); /* early initcall */ + +-static int __init jit_init(void) +-{ +-#ifdef CONFIG_BPF_JIT_ALWAYS_ON +- bpf_jit_enable = 1; +-#endif +- return 0; +-} +-pure_initcall(jit_init); +- + #ifdef CONFIG_PROC_FS + void socket_seq_show(struct seq_file *seq) + { diff --git a/queue-4.9/bpf-restrict-access-to-core-bpf-sysctls.patch b/queue-4.9/bpf-restrict-access-to-core-bpf-sysctls.patch new file mode 100644 index 00000000000..d76a194434d --- /dev/null +++ b/queue-4.9/bpf-restrict-access-to-core-bpf-sysctls.patch @@ -0,0 +1,99 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Fri, 16 Aug 2019 23:59:56 +0100 +Subject: bpf: restrict access to core bpf sysctls +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816225956.GF9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Daniel Borkmann + +commit 2e4a30983b0f9b19b59e38bbf7427d7fdd480d98 upstream. + +Given BPF reaches far beyond just networking these days, it was +never intended to allow setting and in some cases reading those +knobs out of a user namespace root running without CAP_SYS_ADMIN, +thus tighten such access. 
+ +Also the bpf_jit_enable = 2 debugging mode should only be allowed +if kptr_restrict is not set since it otherwise can leak addresses +to the kernel log. Dump a note to the kernel log that this is for +debugging JITs only when enabled. + +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: Alexei Starovoitov +[bwh: Backported to 4.9: + - We don't have bpf_dump_raw_ok(), so drop the condition based on it. This + condition only made it a bit harder for a privileged user to do something + silly. + - Drop change to bpf_jit_kallsyms] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sysctl_net_core.c | 39 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 37 insertions(+), 2 deletions(-) + +--- a/net/core/sysctl_net_core.c ++++ b/net/core/sysctl_net_core.c +@@ -232,6 +232,41 @@ static int proc_do_rss_key(struct ctl_ta + return proc_dostring(&fake_table, write, buffer, lenp, ppos); + } + ++#ifdef CONFIG_BPF_JIT ++static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, ++ loff_t *ppos) ++{ ++ int ret, jit_enable = *(int *)table->data; ++ struct ctl_table tmp = *table; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ tmp.data = &jit_enable; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret) { ++ *(int *)table->data = jit_enable; ++ if (jit_enable == 2) ++ pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); ++ } ++ return ret; ++} ++ ++# ifdef CONFIG_HAVE_EBPF_JIT ++static int ++proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, ++ loff_t *ppos) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++} ++# endif ++#endif ++ + static struct ctl_table net_core_table[] = { + #ifdef CONFIG_NET + { +@@ -293,7 +328,7 @@ static struct ctl_table net_core_table[] + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = proc_dointvec_minmax, ++ .proc_handler = proc_dointvec_minmax_bpf_enable, + # ifdef CONFIG_BPF_JIT_ALWAYS_ON + .extra1 = &one, + .extra2 = &one, +@@ -308,7 +343,7 @@ static struct ctl_table net_core_table[] + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, +- .proc_handler = proc_dointvec_minmax, ++ .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, + }, diff --git a/queue-4.9/inet-switch-ip-id-generator-to-siphash.patch b/queue-4.9/inet-switch-ip-id-generator-to-siphash.patch new file mode 100644 index 00000000000..64f9f9ef872 --- /dev/null +++ b/queue-4.9/inet-switch-ip-id-generator-to-siphash.patch @@ -0,0 +1,158 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:01:27 +0100 +Subject: inet: switch IP ID generator to siphash +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230127.GP9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Eric Dumazet + +commit df453700e8d81b1bdafdf684365ee2b9431fb702 upstream. + +According to Amit Klein and Benny Pinkas, IP ID generation is too weak +and might be used by attackers. + +Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix()) +having 64bit key and Jenkins hash is risky. + +It is time to switch to siphash and its 128bit keys. 
+ +Signed-off-by: Eric Dumazet +Reported-by: Amit Klein +Reported-by: Benny Pinkas +Signed-off-by: David S. Miller +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/siphash.h | 5 +++++ + include/net/netns/ipv4.h | 2 ++ + net/ipv4/route.c | 12 +++++++----- + net/ipv6/output_core.c | 30 ++++++++++++++++-------------- + 4 files changed, 30 insertions(+), 19 deletions(-) + +--- a/include/linux/siphash.h ++++ b/include/linux/siphash.h +@@ -21,6 +21,11 @@ typedef struct { + u64 key[2]; + } siphash_key_t; + ++static inline bool siphash_key_is_zero(const siphash_key_t *key) ++{ ++ return !(key->key[0] | key->key[1]); ++} ++ + u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); + #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + + struct tcpm_hash_bucket; + struct ctl_table_header; +@@ -137,5 +138,6 @@ struct netns_ipv4 { + int sysctl_fib_multipath_use_neigh; + #endif + atomic_t rt_genid; ++ siphash_key_t ip_id_key; + }; + #endif +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -496,15 +496,17 @@ EXPORT_SYMBOL(ip_idents_reserve); + + void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) + { +- static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; + +- net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); ++ /* Note the following code is not safe, but this is okay. */ ++ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) ++ get_random_bytes(&net->ipv4.ip_id_key, ++ sizeof(net->ipv4.ip_id_key)); + +- hash = jhash_3words((__force u32)iph->daddr, ++ hash = siphash_3u32((__force u32)iph->daddr, + (__force u32)iph->saddr, +- iph->protocol ^ net_hash_mix(net), +- ip_idents_hashrnd); ++ iph->protocol, ++ &net->ipv4.ip_id_key); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); + } +--- a/net/ipv6/output_core.c ++++ b/net/ipv6/output_core.c +@@ -10,15 +10,25 @@ + #include + #include + +-static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, ++static u32 __ipv6_select_ident(struct net *net, + const struct in6_addr *dst, + const struct in6_addr *src) + { ++ const struct { ++ struct in6_addr dst; ++ struct in6_addr src; ++ } __aligned(SIPHASH_ALIGNMENT) combined = { ++ .dst = *dst, ++ .src = *src, ++ }; + u32 hash, id; + +- hash = __ipv6_addr_jhash(dst, hashrnd); +- hash = __ipv6_addr_jhash(src, hash); +- hash ^= net_hash_mix(net); ++ /* Note the following code is not safe, but this is okay. 
*/ ++ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) ++ get_random_bytes(&net->ipv4.ip_id_key, ++ sizeof(net->ipv4.ip_id_key)); ++ ++ hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); + + /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, + * set the hight order instead thus minimizing possible future +@@ -41,7 +51,6 @@ static u32 __ipv6_select_ident(struct ne + */ + void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) + { +- static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 id; +@@ -53,11 +62,7 @@ void ipv6_proxy_select_ident(struct net + if (!addrs) + return; + +- net_get_random_once(&ip6_proxy_idents_hashrnd, +- sizeof(ip6_proxy_idents_hashrnd)); +- +- id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, +- &addrs[1], &addrs[0]); ++ id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); + skb_shinfo(skb)->ip6_frag_id = htonl(id); + } + EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); +@@ -66,12 +71,9 @@ __be32 ipv6_select_ident(struct net *net + const struct in6_addr *daddr, + const struct in6_addr *saddr) + { +- static u32 ip6_idents_hashrnd __read_mostly; + u32 id; + +- net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); +- +- id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); ++ id = __ipv6_select_ident(net, daddr, saddr); + return htonl(id); + } + EXPORT_SYMBOL(ipv6_select_ident); diff --git a/queue-4.9/netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch b/queue-4.9/netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch new file mode 100644 index 00000000000..5564722ee20 --- /dev/null +++ b/queue-4.9/netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch @@ -0,0 +1,174 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:01:34 +0100 +Subject: netfilter: ctnetlink: don't use conntrack/expect object addresses as id +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230134.GQ9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Florian Westphal + +commit 3c79107631db1f7fd32cf3f7368e4672004a3010 upstream. + +else, we leak the addresses to userspace via ctnetlink events +and dumps. + +Compute an ID on demand based on the immutable parts of nf_conn struct. + +Another advantage compared to using an address is that there is no +immediate re-use of the same ID in case the conntrack entry is freed and +reallocated again immediately. 
+ +Fixes: 3583240249ef ("[NETFILTER]: nf_conntrack_expect: kill unique ID") +Fixes: 7f85f914721f ("[NETFILTER]: nf_conntrack: kill unique ID") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + include/net/netfilter/nf_conntrack.h | 2 ++ + net/netfilter/nf_conntrack_core.c | 35 +++++++++++++++++++++++++++++++++++ + net/netfilter/nf_conntrack_netlink.c | 34 +++++++++++++++++++++++++++++----- + 3 files changed, 66 insertions(+), 5 deletions(-) + +--- a/include/net/netfilter/nf_conntrack.h ++++ b/include/net/netfilter/nf_conntrack.h +@@ -336,6 +336,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct + gfp_t flags); + void nf_ct_tmpl_free(struct nf_conn *tmpl); + ++u32 nf_ct_get_id(const struct nf_conn *ct); ++ + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) + #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) + #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -301,6 +302,40 @@ nf_ct_invert_tuple(struct nf_conntrack_t + } + EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); + ++/* Generate a almost-unique pseudo-id for a given conntrack. ++ * ++ * intentionally doesn't re-use any of the seeds used for hash ++ * table location, we assume id gets exposed to userspace. ++ * ++ * Following nf_conn items do not change throughout lifetime ++ * of the nf_conn after it has been committed to main hash table: ++ * ++ * 1. nf_conn address ++ * 2. nf_conn->ext address ++ * 3. nf_conn->master address (normally NULL) ++ * 4. tuple ++ * 5. 
the associated net namespace ++ */ ++u32 nf_ct_get_id(const struct nf_conn *ct) ++{ ++ static __read_mostly siphash_key_t ct_id_seed; ++ unsigned long a, b, c, d; ++ ++ net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); ++ ++ a = (unsigned long)ct; ++ b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); ++ c = (unsigned long)ct->ext; ++ d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), ++ &ct_id_seed); ++#ifdef CONFIG_64BIT ++ return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); ++#else ++ return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); ++#endif ++} ++EXPORT_SYMBOL_GPL(nf_ct_get_id); ++ + static void + clean_from_lists(struct nf_conn *ct) + { +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -441,7 +442,9 @@ static int ctnetlink_dump_ct_seq_adj(str + + static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) + { +- if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) ++ __be32 id = (__force __be32)nf_ct_get_id(ct); ++ ++ if (nla_put_be32(skb, CTA_ID, id)) + goto nla_put_failure; + return 0; + +@@ -1166,8 +1169,9 @@ static int ctnetlink_del_conntrack(struc + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_ID]) { +- u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); +- if (id != (u32)(unsigned long)ct) { ++ __be32 id = nla_get_be32(cda[CTA_ID]); ++ ++ if (id != (__force __be32)nf_ct_get_id(ct)) { + nf_ct_put(ct); + return -ENOENT; + } +@@ -2472,6 +2476,25 @@ nla_put_failure: + + static const union nf_inet_addr any_addr; + ++static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) ++{ ++ static __read_mostly siphash_key_t exp_id_seed; ++ unsigned long a, b, c, d; ++ ++ net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); ++ ++ a = (unsigned long)exp; ++ b = (unsigned long)exp->helper; ++ c = (unsigned long)exp->master; ++ d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); ++ ++#ifdef CONFIG_64BIT ++ return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); ++#else ++ return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); ++#endif ++} ++ + static int + ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct nf_conntrack_expect *exp) +@@ -2519,7 +2542,7 @@ ctnetlink_exp_dump_expect(struct sk_buff + } + #endif + if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || +- nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || ++ nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || + nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || + nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) + goto nla_put_failure; +@@ -2818,7 +2841,8 @@ static int ctnetlink_get_expect(struct n + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); +- if (ntohl(id) != (u32)(unsigned long)exp) { ++ ++ if (id != nf_expect_get_id(exp)) { + nf_ct_expect_put(exp); + return -ENOENT; + } diff --git a/queue-4.9/series b/queue-4.9/series index 11194e115bb..cf4ac5a8df4 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -43,3 +43,16 @@ scsi-mpt3sas-use-63-bit-dma-addressing-on-sas35-hba.patch sh-kernel-hw_breakpoint-fix-missing-break-in-switch-statement.patch mm-usercopy-use-memory-range-to-be-accessed-for-wraparound-check.patch mm-memcontrol.c-fix-use-after-free-in-mem_cgroup_iter.patch +bpf-get-rid-of-pure_initcall-dependency-to-enable-jits.patch 
+bpf-restrict-access-to-core-bpf-sysctls.patch +bpf-add-bpf_jit_limit-knob-to-restrict-unpriv-allocations.patch +vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch +vhost_net-use-packet-weight-for-rx-handler-too.patch +vhost_net-introduce-vhost_exceeds_weight.patch +vhost-introduce-vhost_exceeds_weight.patch +vhost_net-fix-possible-infinite-loop.patch +vhost-scsi-add-weight-support.patch +siphash-add-cryptographically-secure-prf.patch +siphash-implement-halfsiphash1-3-for-hash-tables.patch +inet-switch-ip-id-generator-to-siphash.patch +netfilter-ctnetlink-don-t-use-conntrack-expect-object-addresses-as-id.patch diff --git a/queue-4.9/siphash-add-cryptographically-secure-prf.patch b/queue-4.9/siphash-add-cryptographically-secure-prf.patch new file mode 100644 index 00000000000..2f2ee2f91e2 --- /dev/null +++ b/queue-4.9/siphash-add-cryptographically-secure-prf.patch @@ -0,0 +1,694 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:01:12 +0100 +Subject: siphash: add cryptographically secure PRF +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230112.GN9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason A. Donenfeld + +commit 2c956a60778cbb6a27e0c7a8a52a91378c90e1d1 upstream. + +SipHash is a 64-bit keyed hash function that is actually a +cryptographically secure PRF, like HMAC. Except SipHash is super fast, +and is meant to be used as a hashtable keyed lookup function, or as a +general PRF for short input use cases, such as sequence numbers or RNG +chaining. + +For the first usage: + +There are a variety of attacks known as "hashtable poisoning" in which an +attacker forms some data such that the hash of that data will be the +same, and then preceeds to fill up all entries of a hashbucket. This is +a realistic and well-known denial-of-service vector. Currently +hashtables use jhash, which is fast but not secure, and some kind of +rotating key scheme (or none at all, which isn't good). SipHash is meant +as a replacement for jhash in these cases. + +There are a modicum of places in the kernel that are vulnerable to +hashtable poisoning attacks, either via userspace vectors or network +vectors, and there's not a reliable mechanism inside the kernel at the +moment to fix it. The first step toward fixing these issues is actually +getting a secure primitive into the kernel for developers to use. Then +we can, bit by bit, port things over to it as deemed appropriate. + +While SipHash is extremely fast for a cryptographically secure function, +it is likely a bit slower than the insecure jhash, and so replacements +will be evaluated on a case-by-case basis based on whether or not the +difference in speed is negligible and whether or not the current jhash usage +poses a real security risk. + +For the second usage: + +A few places in the kernel are using MD5 or SHA1 for creating secure +sequence numbers, syn cookies, port numbers, or fast random numbers. +SipHash is a faster and more fitting, and more secure replacement for MD5 +in those situations. Replacing MD5 and SHA1 with SipHash for these uses is +obvious and straight-forward, and so is submitted along with this patch +series. There shouldn't be much of a debate over its efficacy. + +Dozens of languages are already using this internally for their hash +tables and PRFs. Some of the BSDs already use this in their kernels. +SipHash is a widely known high-speed solution to a widely known set of +problems, and it's time we catch-up. 
+ +Signed-off-by: Jason A. Donenfeld +Reviewed-by: Jean-Philippe Aumasson +Cc: Linus Torvalds +Cc: Eric Biggers +Cc: David Laight +Cc: Eric Dumazet +Signed-off-by: David S. Miller +[bwh: Backported to 4.9 as dependency of commits df453700e8d8 "inet: switch + IP ID generator to siphash" and 3c79107631db "netfilter: ctnetlink: don't + use conntrack/expect object addresses as id"] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/siphash.txt | 100 +++++++++++++++++++ + MAINTAINERS | 7 + + include/linux/siphash.h | 85 ++++++++++++++++ + lib/Kconfig.debug | 6 - + lib/Makefile | 5 + lib/siphash.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++ + lib/test_siphash.c | 131 +++++++++++++++++++++++++ + 7 files changed, 561 insertions(+), 5 deletions(-) + create mode 100644 Documentation/siphash.txt + create mode 100644 include/linux/siphash.h + create mode 100644 lib/siphash.c + create mode 100644 lib/test_siphash.c + +--- /dev/null ++++ b/Documentation/siphash.txt +@@ -0,0 +1,100 @@ ++ SipHash - a short input PRF ++----------------------------------------------- ++Written by Jason A. Donenfeld ++ ++SipHash is a cryptographically secure PRF -- a keyed hash function -- that ++performs very well for short inputs, hence the name. It was designed by ++cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended ++as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`, ++and so forth. ++ ++SipHash takes a secret key filled with randomly generated numbers and either ++an input buffer or several input integers. It spits out an integer that is ++indistinguishable from random. You may then use that integer as part of secure ++sequence numbers, secure cookies, or mask it off for use in a hash table. ++ ++1. Generating a key ++ ++Keys should always be generated from a cryptographically secure source of ++random numbers, either using get_random_bytes or get_random_once: ++ ++siphash_key_t key; ++get_random_bytes(&key, sizeof(key)); ++ ++If you're not deriving your key from here, you're doing it wrong. ++ ++2. Using the functions ++ ++There are two variants of the function, one that takes a list of integers, and ++one that takes a buffer: ++ ++u64 siphash(const void *data, size_t len, const siphash_key_t *key); ++ ++And: ++ ++u64 siphash_1u64(u64, const siphash_key_t *key); ++u64 siphash_2u64(u64, u64, const siphash_key_t *key); ++u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key); ++u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key); ++u64 siphash_1u32(u32, const siphash_key_t *key); ++u64 siphash_2u32(u32, u32, const siphash_key_t *key); ++u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key); ++u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key); ++ ++If you pass the generic siphash function something of a constant length, it ++will constant fold at compile-time and automatically choose one of the ++optimized functions. ++ ++3. Hashtable key function usage: ++ ++struct some_hashtable { ++ DECLARE_HASHTABLE(hashtable, 8); ++ siphash_key_t key; ++}; ++ ++void init_hashtable(struct some_hashtable *table) ++{ ++ get_random_bytes(&table->key, sizeof(table->key)); ++} ++ ++static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) ++{ ++ return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; ++} ++ ++You may then iterate like usual over the returned hash bucket. ++ ++4. 
Security ++ ++SipHash has a very high security margin, with its 128-bit key. So long as the ++key is kept secret, it is impossible for an attacker to guess the outputs of ++the function, even if being able to observe many outputs, since 2^128 outputs ++is significant. ++ ++Linux implements the "2-4" variant of SipHash. ++ ++5. Struct-passing Pitfalls ++ ++Often times the XuY functions will not be large enough, and instead you'll ++want to pass a pre-filled struct to siphash. When doing this, it's important ++to always ensure the struct has no padding holes. The easiest way to do this ++is to simply arrange the members of the struct in descending order of size, ++and to use offsetendof() instead of sizeof() for getting the size. For ++performance reasons, if possible, it's probably a good thing to align the ++struct to the right boundary. Here's an example: ++ ++const struct { ++ struct in6_addr saddr; ++ u32 counter; ++ u16 dport; ++} __aligned(SIPHASH_ALIGNMENT) combined = { ++ .saddr = *(struct in6_addr *)saddr, ++ .counter = counter, ++ .dport = dport ++}; ++u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret); ++ ++6. Resources ++ ++Read the SipHash paper if you're interested in learning more: ++https://131002.net/siphash/siphash.pdf +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -11068,6 +11068,13 @@ F: arch/arm/mach-s3c24xx/mach-bast.c + F: arch/arm/mach-s3c24xx/bast-ide.c + F: arch/arm/mach-s3c24xx/bast-irq.c + ++SIPHASH PRF ROUTINES ++M: Jason A. Donenfeld ++S: Maintained ++F: lib/siphash.c ++F: lib/test_siphash.c ++F: include/linux/siphash.h ++ + TI DAVINCI MACHINE SUPPORT + M: Sekhar Nori + M: Kevin Hilman +--- /dev/null ++++ b/include/linux/siphash.h +@@ -0,0 +1,85 @@ ++/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. ++ * ++ * This file is provided under a dual BSD/GPLv2 license. ++ * ++ * SipHash: a fast short-input PRF ++ * https://131002.net/siphash/ ++ * ++ * This implementation is specifically for SipHash2-4. 
++ */ ++ ++#ifndef _LINUX_SIPHASH_H ++#define _LINUX_SIPHASH_H ++ ++#include ++#include ++ ++#define SIPHASH_ALIGNMENT __alignof__(u64) ++typedef struct { ++ u64 key[2]; ++} siphash_key_t; ++ ++u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); ++#endif ++ ++u64 siphash_1u64(const u64 a, const siphash_key_t *key); ++u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); ++u64 siphash_3u64(const u64 a, const u64 b, const u64 c, ++ const siphash_key_t *key); ++u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, ++ const siphash_key_t *key); ++u64 siphash_1u32(const u32 a, const siphash_key_t *key); ++u64 siphash_3u32(const u32 a, const u32 b, const u32 c, ++ const siphash_key_t *key); ++ ++static inline u64 siphash_2u32(const u32 a, const u32 b, ++ const siphash_key_t *key) ++{ ++ return siphash_1u64((u64)b << 32 | a, key); ++} ++static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, ++ const u32 d, const siphash_key_t *key) ++{ ++ return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); ++} ++ ++ ++static inline u64 ___siphash_aligned(const __le64 *data, size_t len, ++ const siphash_key_t *key) ++{ ++ if (__builtin_constant_p(len) && len == 4) ++ return siphash_1u32(le32_to_cpup((const __le32 *)data), key); ++ if (__builtin_constant_p(len) && len == 8) ++ return siphash_1u64(le64_to_cpu(data[0]), key); ++ if (__builtin_constant_p(len) && len == 16) ++ return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), ++ key); ++ if (__builtin_constant_p(len) && len == 24) ++ return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), ++ le64_to_cpu(data[2]), key); ++ if (__builtin_constant_p(len) && len == 32) ++ return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), ++ le64_to_cpu(data[2]), le64_to_cpu(data[3]), ++ key); ++ return __siphash_aligned(data, len, key); ++} ++ ++/** ++ * siphash - compute 64-bit siphash PRF value ++ * @data: buffer to hash ++ * @size: size of @data ++ * @key: the siphash key ++ */ ++static inline u64 siphash(const void *data, size_t len, ++ const siphash_key_t *key) ++{ ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++ if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) ++ return __siphash_unaligned(data, len, key); ++#endif ++ return ___siphash_aligned(data, len, key); ++} ++ ++#endif /* _LINUX_SIPHASH_H */ +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1822,9 +1822,9 @@ config TEST_HASH + tristate "Perform selftest on hash functions" + default n + help +- Enable this option to test the kernel's integer () +- and string () hash functions on boot +- (or module load). ++ Enable this option to test the kernel's integer (), ++ string (), and siphash () ++ hash functions on boot (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. 
+--- a/lib/Makefile ++++ b/lib/Makefile +@@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmd + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ + is_single_threaded.o plist.o decompress.o kobject_uevent.o \ +- earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o ++ earlycpio.o seq_buf.o siphash.o \ ++ nmi_backtrace.o nodemask.o win_minmax.o + + lib-$(CONFIG_MMU) += ioremap.o + lib-$(CONFIG_SMP) += cpumask.o +@@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdu + obj-y += kstrtox.o + obj-$(CONFIG_TEST_BPF) += test_bpf.o + obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +-obj-$(CONFIG_TEST_HASH) += test_hash.o ++obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o + obj-$(CONFIG_TEST_KASAN) += test_kasan.o + CFLAGS_test_kasan.o += -fno-builtin + obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +--- /dev/null ++++ b/lib/siphash.c +@@ -0,0 +1,232 @@ ++/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. ++ * ++ * This file is provided under a dual BSD/GPLv2 license. ++ * ++ * SipHash: a fast short-input PRF ++ * https://131002.net/siphash/ ++ * ++ * This implementation is specifically for SipHash2-4. ++ */ ++ ++#include ++#include ++ ++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 ++#include ++#include ++#endif ++ ++#define SIPROUND \ ++ do { \ ++ v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \ ++ v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \ ++ v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \ ++ v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \ ++ } while (0) ++ ++#define PREAMBLE(len) \ ++ u64 v0 = 0x736f6d6570736575ULL; \ ++ u64 v1 = 0x646f72616e646f6dULL; \ ++ u64 v2 = 0x6c7967656e657261ULL; \ ++ u64 v3 = 0x7465646279746573ULL; \ ++ u64 b = ((u64)(len)) << 56; \ ++ v3 ^= key->key[1]; \ ++ v2 ^= key->key[0]; \ ++ v1 ^= key->key[1]; \ ++ v0 ^= key->key[0]; ++ ++#define POSTAMBLE \ ++ v3 ^= b; \ ++ SIPROUND; \ ++ SIPROUND; \ ++ v0 ^= b; \ ++ v2 ^= 0xff; \ ++ SIPROUND; \ ++ SIPROUND; \ ++ SIPROUND; \ ++ SIPROUND; \ ++ return (v0 ^ v1) ^ (v2 ^ v3); ++ ++u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u64)); ++ const u8 left = len & (sizeof(u64) - 1); ++ u64 m; ++ PREAMBLE(len) ++ for (; data != end; data += sizeof(u64)) { ++ m = le64_to_cpup(data); ++ v3 ^= m; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= m; ++ } ++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 ++ if (left) ++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & ++ bytemask_from_count(left))); ++#else ++ switch (left) { ++ case 7: b |= ((u64)end[6]) << 48; ++ case 6: b |= ((u64)end[5]) << 40; ++ case 5: b |= ((u64)end[4]) << 32; ++ case 4: b |= le32_to_cpup(data); break; ++ case 3: b |= ((u64)end[2]) << 16; ++ case 2: b |= le16_to_cpup(data); break; ++ case 1: b |= end[0]; ++ } ++#endif ++ POSTAMBLE ++} ++EXPORT_SYMBOL(__siphash_aligned); ++ ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u64)); ++ const u8 left = len & (sizeof(u64) - 1); ++ u64 m; ++ PREAMBLE(len) ++ for (; data != end; data += sizeof(u64)) { ++ m = get_unaligned_le64(data); ++ v3 ^= m; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= m; ++ } ++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 ++ if (left) ++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & ++ bytemask_from_count(left))); ++#else ++ switch (left) { 
++ case 7: b |= ((u64)end[6]) << 48; ++ case 6: b |= ((u64)end[5]) << 40; ++ case 5: b |= ((u64)end[4]) << 32; ++ case 4: b |= get_unaligned_le32(end); break; ++ case 3: b |= ((u64)end[2]) << 16; ++ case 2: b |= get_unaligned_le16(end); break; ++ case 1: b |= end[0]; ++ } ++#endif ++ POSTAMBLE ++} ++EXPORT_SYMBOL(__siphash_unaligned); ++#endif ++ ++/** ++ * siphash_1u64 - compute 64-bit siphash PRF value of a u64 ++ * @first: first u64 ++ * @key: the siphash key ++ */ ++u64 siphash_1u64(const u64 first, const siphash_key_t *key) ++{ ++ PREAMBLE(8) ++ v3 ^= first; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= first; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_1u64); ++ ++/** ++ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 ++ * @first: first u64 ++ * @second: second u64 ++ * @key: the siphash key ++ */ ++u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) ++{ ++ PREAMBLE(16) ++ v3 ^= first; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= second; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_2u64); ++ ++/** ++ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 ++ * @first: first u64 ++ * @second: second u64 ++ * @third: third u64 ++ * @key: the siphash key ++ */ ++u64 siphash_3u64(const u64 first, const u64 second, const u64 third, ++ const siphash_key_t *key) ++{ ++ PREAMBLE(24) ++ v3 ^= first; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= second; ++ v3 ^= third; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= third; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_3u64); ++ ++/** ++ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 ++ * @first: first u64 ++ * @second: second u64 ++ * @third: third u64 ++ * @forth: forth u64 ++ * @key: the siphash key ++ */ ++u64 siphash_4u64(const u64 first, const u64 second, const u64 third, ++ const u64 forth, const siphash_key_t *key) ++{ ++ PREAMBLE(32) ++ v3 ^= first; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= second; ++ v3 ^= third; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= third; ++ v3 ^= forth; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= forth; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_4u64); ++ ++u64 siphash_1u32(const u32 first, const siphash_key_t *key) ++{ ++ PREAMBLE(4) ++ b |= first; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_1u32); ++ ++u64 siphash_3u32(const u32 first, const u32 second, const u32 third, ++ const siphash_key_t *key) ++{ ++ u64 combined = (u64)second << 32 | first; ++ PREAMBLE(12) ++ v3 ^= combined; ++ SIPROUND; ++ SIPROUND; ++ v0 ^= combined; ++ b |= third; ++ POSTAMBLE ++} ++EXPORT_SYMBOL(siphash_3u32); +--- /dev/null ++++ b/lib/test_siphash.c +@@ -0,0 +1,131 @@ ++/* Test cases for siphash.c ++ * ++ * Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. ++ * ++ * This file is provided under a dual BSD/GPLv2 license. ++ * ++ * SipHash: a fast short-input PRF ++ * https://131002.net/siphash/ ++ * ++ * This implementation is specifically for SipHash2-4. 
++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* Test vectors taken from official reference source available at: ++ * https://131002.net/siphash/siphash24.c ++ */ ++ ++static const siphash_key_t test_key_siphash = ++ {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }}; ++ ++static const u64 test_vectors_siphash[64] = { ++ 0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL, ++ 0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL, ++ 0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL, ++ 0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL, ++ 0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL, ++ 0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL, ++ 0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL, ++ 0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL, ++ 0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL, ++ 0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL, ++ 0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL, ++ 0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL, ++ 0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL, ++ 0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL, ++ 0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL, ++ 0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL, ++ 0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL, ++ 0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL, ++ 0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL, ++ 0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL, ++ 0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL, ++ 0x958a324ceb064572ULL ++}; ++ ++static int __init siphash_test_init(void) ++{ ++ u8 in[64] __aligned(SIPHASH_ALIGNMENT); ++ u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT); ++ u8 i; ++ int ret = 0; ++ ++ for (i = 0; i < 64; ++i) { ++ in[i] = i; ++ in_unaligned[i + 1] = i; ++ if (siphash(in, i, &test_key_siphash) != ++ test_vectors_siphash[i]) { ++ pr_info("siphash self-test aligned %u: FAIL\n", i + 1); ++ ret = -EINVAL; ++ } ++ if (siphash(in_unaligned + 1, i, &test_key_siphash) != ++ test_vectors_siphash[i]) { ++ pr_info("siphash self-test unaligned %u: FAIL\n", i + 1); ++ ret = -EINVAL; ++ } ++ } ++ if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) != ++ test_vectors_siphash[8]) { ++ pr_info("siphash self-test 1u64: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, ++ &test_key_siphash) != test_vectors_siphash[16]) { ++ pr_info("siphash self-test 2u64: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, ++ 0x1716151413121110ULL, &test_key_siphash) != ++ test_vectors_siphash[24]) { ++ pr_info("siphash self-test 3u64: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, ++ 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, ++ &test_key_siphash) != test_vectors_siphash[32]) { ++ pr_info("siphash self-test 4u64: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_1u32(0x03020100U, &test_key_siphash) != ++ test_vectors_siphash[4]) { ++ pr_info("siphash self-test 1u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) != ++ 
test_vectors_siphash[8]) { ++ pr_info("siphash self-test 2u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_3u32(0x03020100U, 0x07060504U, ++ 0x0b0a0908U, &test_key_siphash) != ++ test_vectors_siphash[12]) { ++ pr_info("siphash self-test 3u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (siphash_4u32(0x03020100U, 0x07060504U, ++ 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) != ++ test_vectors_siphash[16]) { ++ pr_info("siphash self-test 4u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (!ret) ++ pr_info("self-tests: pass\n"); ++ return ret; ++} ++ ++static void __exit siphash_test_exit(void) ++{ ++} ++ ++module_init(siphash_test_init); ++module_exit(siphash_test_exit); ++ ++MODULE_AUTHOR("Jason A. Donenfeld "); ++MODULE_LICENSE("Dual BSD/GPL"); diff --git a/queue-4.9/siphash-implement-halfsiphash1-3-for-hash-tables.patch b/queue-4.9/siphash-implement-halfsiphash1-3-for-hash-tables.patch new file mode 100644 index 00000000000..c70e926b442 --- /dev/null +++ b/queue-4.9/siphash-implement-halfsiphash1-3-for-hash-tables.patch @@ -0,0 +1,677 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:01:19 +0100 +Subject: siphash: implement HalfSipHash1-3 for hash tables +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230119.GO9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason A. Donenfeld + +commit 1ae2324f732c9c4e2fa4ebd885fa1001b70d52e1 upstream. + +HalfSipHash, or hsiphash, is a shortened version of SipHash, which +generates 32-bit outputs using a weaker 64-bit key. It has *much* lower +security margins, and shouldn't be used for anything too sensitive, but +it could be used as a hashtable key function replacement, if the output +is never exposed, and if the security requirement is not too high. + +The goal is to make this something that performance-critical jhash users +would be willing to use. + +On 64-bit machines, HalfSipHash1-3 is slower than SipHash1-3, so we alias +SipHash1-3 to HalfSipHash1-3 on those systems. + +64-bit x86_64: +[ 0.509409] test_siphash: SipHash2-4 cycles: 4049181 +[ 0.510650] test_siphash: SipHash1-3 cycles: 2512884 +[ 0.512205] test_siphash: HalfSipHash1-3 cycles: 3429920 +[ 0.512904] test_siphash: JenkinsHash cycles: 978267 +So, we map hsiphash() -> SipHash1-3 + +32-bit x86: +[ 0.509868] test_siphash: SipHash2-4 cycles: 14812892 +[ 0.513601] test_siphash: SipHash1-3 cycles: 9510710 +[ 0.515263] test_siphash: HalfSipHash1-3 cycles: 3856157 +[ 0.515952] test_siphash: JenkinsHash cycles: 1148567 +So, we map hsiphash() -> HalfSipHash1-3 + +hsiphash() is roughly 3 times slower than jhash(), but comes with a +considerable security improvement. + +Signed-off-by: Jason A. Donenfeld +Reviewed-by: Jean-Philippe Aumasson +Signed-off-by: David S. 
Miller +[bwh: Backported to 4.9 to avoid regression for WireGuard with only half + the siphash API present] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/siphash.txt | 75 ++++++++++ + include/linux/siphash.h | 57 ++++++++ + lib/siphash.c | 321 +++++++++++++++++++++++++++++++++++++++++++++- + lib/test_siphash.c | 98 +++++++++++++- + 4 files changed, 546 insertions(+), 5 deletions(-) + +--- a/Documentation/siphash.txt ++++ b/Documentation/siphash.txt +@@ -98,3 +98,78 @@ u64 h = siphash(&combined, offsetofend(t + + Read the SipHash paper if you're interested in learning more: + https://131002.net/siphash/siphash.pdf ++ ++ ++~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~ ++ ++HalfSipHash - SipHash's insecure younger cousin ++----------------------------------------------- ++Written by Jason A. Donenfeld ++ ++On the off-chance that SipHash is not fast enough for your needs, you might be ++able to justify using HalfSipHash, a terrifying but potentially useful ++possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and, ++even scarier, uses an easily brute-forcable 64-bit key (with a 32-bit output) ++instead of SipHash's 128-bit key. However, this may appeal to some ++high-performance `jhash` users. ++ ++Danger! ++ ++Do not ever use HalfSipHash except for as a hashtable key function, and only ++then when you can be absolutely certain that the outputs will never be ++transmitted out of the kernel. This is only remotely useful over `jhash` as a ++means of mitigating hashtable flooding denial of service attacks. ++ ++1. Generating a key ++ ++Keys should always be generated from a cryptographically secure source of ++random numbers, either using get_random_bytes or get_random_once: ++ ++hsiphash_key_t key; ++get_random_bytes(&key, sizeof(key)); ++ ++If you're not deriving your key from here, you're doing it wrong. ++ ++2. Using the functions ++ ++There are two variants of the function, one that takes a list of integers, and ++one that takes a buffer: ++ ++u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key); ++ ++And: ++ ++u32 hsiphash_1u32(u32, const hsiphash_key_t *key); ++u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key); ++u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key); ++u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key); ++ ++If you pass the generic hsiphash function something of a constant length, it ++will constant fold at compile-time and automatically choose one of the ++optimized functions. ++ ++3. Hashtable key function usage: ++ ++struct some_hashtable { ++ DECLARE_HASHTABLE(hashtable, 8); ++ hsiphash_key_t key; ++}; ++ ++void init_hashtable(struct some_hashtable *table) ++{ ++ get_random_bytes(&table->key, sizeof(table->key)); ++} ++ ++static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) ++{ ++ return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; ++} ++ ++You may then iterate like usual over the returned hash bucket. ++ ++4. Performance ++ ++HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements, ++this will not be a problem, as the hashtable lookup isn't the bottleneck. And ++in general, this is probably a good sacrifice to make for the security and DoS ++resistance of HalfSipHash. 
+--- a/include/linux/siphash.h ++++ b/include/linux/siphash.h +@@ -5,7 +5,9 @@ + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * +- * This implementation is specifically for SipHash2-4. ++ * This implementation is specifically for SipHash2-4 for a secure PRF ++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for ++ * hashtables. + */ + + #ifndef _LINUX_SIPHASH_H +@@ -82,4 +84,57 @@ static inline u64 siphash(const void *da + return ___siphash_aligned(data, len, key); + } + ++#define HSIPHASH_ALIGNMENT __alignof__(unsigned long) ++typedef struct { ++ unsigned long key[2]; ++} hsiphash_key_t; ++ ++u32 __hsiphash_aligned(const void *data, size_t len, ++ const hsiphash_key_t *key); ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++u32 __hsiphash_unaligned(const void *data, size_t len, ++ const hsiphash_key_t *key); ++#endif ++ ++u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); ++u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); ++u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, ++ const hsiphash_key_t *key); ++u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, ++ const hsiphash_key_t *key); ++ ++static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, ++ const hsiphash_key_t *key) ++{ ++ if (__builtin_constant_p(len) && len == 4) ++ return hsiphash_1u32(le32_to_cpu(data[0]), key); ++ if (__builtin_constant_p(len) && len == 8) ++ return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), ++ key); ++ if (__builtin_constant_p(len) && len == 12) ++ return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), ++ le32_to_cpu(data[2]), key); ++ if (__builtin_constant_p(len) && len == 16) ++ return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), ++ le32_to_cpu(data[2]), le32_to_cpu(data[3]), ++ key); ++ return __hsiphash_aligned(data, len, key); ++} ++ ++/** ++ * hsiphash - compute 32-bit hsiphash PRF value ++ * @data: buffer to hash ++ * @size: size of @data ++ * @key: the hsiphash key ++ */ ++static inline u32 hsiphash(const void *data, size_t len, ++ const hsiphash_key_t *key) ++{ ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++ if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) ++ return __hsiphash_unaligned(data, len, key); ++#endif ++ return ___hsiphash_aligned(data, len, key); ++} ++ + #endif /* _LINUX_SIPHASH_H */ +--- a/lib/siphash.c ++++ b/lib/siphash.c +@@ -5,7 +5,9 @@ + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * +- * This implementation is specifically for SipHash2-4. ++ * This implementation is specifically for SipHash2-4 for a secure PRF ++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for ++ * hashtables. + */ + + #include +@@ -230,3 +232,320 @@ u64 siphash_3u32(const u32 first, const + POSTAMBLE + } + EXPORT_SYMBOL(siphash_3u32); ++ ++#if BITS_PER_LONG == 64 ++/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for ++ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. 
++ */ ++ ++#define HSIPROUND SIPROUND ++#define HPREAMBLE(len) PREAMBLE(len) ++#define HPOSTAMBLE \ ++ v3 ^= b; \ ++ HSIPROUND; \ ++ v0 ^= b; \ ++ v2 ^= 0xff; \ ++ HSIPROUND; \ ++ HSIPROUND; \ ++ HSIPROUND; \ ++ return (v0 ^ v1) ^ (v2 ^ v3); ++ ++u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u64)); ++ const u8 left = len & (sizeof(u64) - 1); ++ u64 m; ++ HPREAMBLE(len) ++ for (; data != end; data += sizeof(u64)) { ++ m = le64_to_cpup(data); ++ v3 ^= m; ++ HSIPROUND; ++ v0 ^= m; ++ } ++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 ++ if (left) ++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & ++ bytemask_from_count(left))); ++#else ++ switch (left) { ++ case 7: b |= ((u64)end[6]) << 48; ++ case 6: b |= ((u64)end[5]) << 40; ++ case 5: b |= ((u64)end[4]) << 32; ++ case 4: b |= le32_to_cpup(data); break; ++ case 3: b |= ((u64)end[2]) << 16; ++ case 2: b |= le16_to_cpup(data); break; ++ case 1: b |= end[0]; ++ } ++#endif ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(__hsiphash_aligned); ++ ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++u32 __hsiphash_unaligned(const void *data, size_t len, ++ const hsiphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u64)); ++ const u8 left = len & (sizeof(u64) - 1); ++ u64 m; ++ HPREAMBLE(len) ++ for (; data != end; data += sizeof(u64)) { ++ m = get_unaligned_le64(data); ++ v3 ^= m; ++ HSIPROUND; ++ v0 ^= m; ++ } ++#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 ++ if (left) ++ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & ++ bytemask_from_count(left))); ++#else ++ switch (left) { ++ case 7: b |= ((u64)end[6]) << 48; ++ case 6: b |= ((u64)end[5]) << 40; ++ case 5: b |= ((u64)end[4]) << 32; ++ case 4: b |= get_unaligned_le32(end); break; ++ case 3: b |= ((u64)end[2]) << 16; ++ case 2: b |= get_unaligned_le16(end); break; ++ case 1: b |= end[0]; ++ } ++#endif ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(__hsiphash_unaligned); ++#endif ++ ++/** ++ * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 ++ * @first: first u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) ++{ ++ HPREAMBLE(4) ++ b |= first; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_1u32); ++ ++/** ++ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) ++{ ++ u64 combined = (u64)second << 32 | first; ++ HPREAMBLE(8) ++ v3 ^= combined; ++ HSIPROUND; ++ v0 ^= combined; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_2u32); ++ ++/** ++ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @third: third u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, ++ const hsiphash_key_t *key) ++{ ++ u64 combined = (u64)second << 32 | first; ++ HPREAMBLE(12) ++ v3 ^= combined; ++ HSIPROUND; ++ v0 ^= combined; ++ b |= third; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_3u32); ++ ++/** ++ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @third: third u32 ++ * @forth: forth u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, ++ const u32 forth, const hsiphash_key_t *key) ++{ ++ u64 combined = (u64)second << 32 | 
first; ++ HPREAMBLE(16) ++ v3 ^= combined; ++ HSIPROUND; ++ v0 ^= combined; ++ combined = (u64)forth << 32 | third; ++ v3 ^= combined; ++ HSIPROUND; ++ v0 ^= combined; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_4u32); ++#else ++#define HSIPROUND \ ++ do { \ ++ v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \ ++ v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \ ++ v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \ ++ v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \ ++ } while (0) ++ ++#define HPREAMBLE(len) \ ++ u32 v0 = 0; \ ++ u32 v1 = 0; \ ++ u32 v2 = 0x6c796765U; \ ++ u32 v3 = 0x74656462U; \ ++ u32 b = ((u32)(len)) << 24; \ ++ v3 ^= key->key[1]; \ ++ v2 ^= key->key[0]; \ ++ v1 ^= key->key[1]; \ ++ v0 ^= key->key[0]; ++ ++#define HPOSTAMBLE \ ++ v3 ^= b; \ ++ HSIPROUND; \ ++ v0 ^= b; \ ++ v2 ^= 0xff; \ ++ HSIPROUND; \ ++ HSIPROUND; \ ++ HSIPROUND; \ ++ return v1 ^ v3; ++ ++u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u32)); ++ const u8 left = len & (sizeof(u32) - 1); ++ u32 m; ++ HPREAMBLE(len) ++ for (; data != end; data += sizeof(u32)) { ++ m = le32_to_cpup(data); ++ v3 ^= m; ++ HSIPROUND; ++ v0 ^= m; ++ } ++ switch (left) { ++ case 3: b |= ((u32)end[2]) << 16; ++ case 2: b |= le16_to_cpup(data); break; ++ case 1: b |= end[0]; ++ } ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(__hsiphash_aligned); ++ ++#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ++u32 __hsiphash_unaligned(const void *data, size_t len, ++ const hsiphash_key_t *key) ++{ ++ const u8 *end = data + len - (len % sizeof(u32)); ++ const u8 left = len & (sizeof(u32) - 1); ++ u32 m; ++ HPREAMBLE(len) ++ for (; data != end; data += sizeof(u32)) { ++ m = get_unaligned_le32(data); ++ v3 ^= m; ++ HSIPROUND; ++ v0 ^= m; ++ } ++ switch (left) { ++ case 3: b |= ((u32)end[2]) << 16; ++ case 2: b |= get_unaligned_le16(end); break; ++ case 1: b |= end[0]; ++ } ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(__hsiphash_unaligned); ++#endif ++ ++/** ++ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 ++ * @first: first u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) ++{ ++ HPREAMBLE(4) ++ v3 ^= first; ++ HSIPROUND; ++ v0 ^= first; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_1u32); ++ ++/** ++ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) ++{ ++ HPREAMBLE(8) ++ v3 ^= first; ++ HSIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ HSIPROUND; ++ v0 ^= second; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_2u32); ++ ++/** ++ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @third: third u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, ++ const hsiphash_key_t *key) ++{ ++ HPREAMBLE(12) ++ v3 ^= first; ++ HSIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ HSIPROUND; ++ v0 ^= second; ++ v3 ^= third; ++ HSIPROUND; ++ v0 ^= third; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_3u32); ++ ++/** ++ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 ++ * @first: first u32 ++ * @second: second u32 ++ * @third: third u32 ++ * @forth: forth u32 ++ * @key: the hsiphash key ++ */ ++u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, ++ const u32 forth, const hsiphash_key_t *key) ++{ ++ HPREAMBLE(16) ++ v3 ^= first; ++ 
HSIPROUND; ++ v0 ^= first; ++ v3 ^= second; ++ HSIPROUND; ++ v0 ^= second; ++ v3 ^= third; ++ HSIPROUND; ++ v0 ^= third; ++ v3 ^= forth; ++ HSIPROUND; ++ v0 ^= forth; ++ HPOSTAMBLE ++} ++EXPORT_SYMBOL(hsiphash_4u32); ++#endif +--- a/lib/test_siphash.c ++++ b/lib/test_siphash.c +@@ -7,7 +7,9 @@ + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * +- * This implementation is specifically for SipHash2-4. ++ * This implementation is specifically for SipHash2-4 for a secure PRF ++ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for ++ * hashtables. + */ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +@@ -18,8 +20,8 @@ + #include + #include + +-/* Test vectors taken from official reference source available at: +- * https://131002.net/siphash/siphash24.c ++/* Test vectors taken from reference source available at: ++ * https://github.com/veorq/SipHash + */ + + static const siphash_key_t test_key_siphash = +@@ -50,6 +52,64 @@ static const u64 test_vectors_siphash[64 + 0x958a324ceb064572ULL + }; + ++#if BITS_PER_LONG == 64 ++static const hsiphash_key_t test_key_hsiphash = ++ {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }}; ++ ++static const u32 test_vectors_hsiphash[64] = { ++ 0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU, ++ 0xe7ddf7fbU, 0x88d38328U, 0x49533b67U, ++ 0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU, ++ 0x6c063de4U, 0x92ff097fU, 0xf94dc352U, ++ 0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U, ++ 0x2a519956U, 0x7d908b66U, 0x63dbd80cU, ++ 0xb473e63eU, 0x8d297d1cU, 0xa6cce040U, ++ 0x2b45f844U, 0xa320872eU, 0xdae6c123U, ++ 0x67349c8cU, 0x705b0979U, 0xca9913a5U, ++ 0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U, ++ 0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU, ++ 0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U, ++ 0xada26206U, 0xa3c33057U, 0xae3a36a1U, ++ 0x7b108392U, 0x99e41531U, 0x3f1ad944U, ++ 0xc8138825U, 0xc28949a6U, 0xfaf8876bU, ++ 0x9f042196U, 0x68b1d623U, 0x8b5114fdU, ++ 0xdf074c46U, 0x12cc86b3U, 0x0a52098fU, ++ 0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U, ++ 0x73f0bce6U, 0x70a7e980U, 0x243c6d75U, ++ 0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U, ++ 0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U, ++ 0xb7bbb3a8U ++}; ++#else ++static const hsiphash_key_t test_key_hsiphash = ++ {{ 0x03020100U, 0x07060504U }}; ++ ++static const u32 test_vectors_hsiphash[64] = { ++ 0x5814c896U, 0xe7e864caU, 0xbc4b0e30U, ++ 0x01539939U, 0x7e059ea6U, 0x88e3d89bU, ++ 0xa0080b65U, 0x9d38d9d6U, 0x577999b1U, ++ 0xc839caedU, 0xe4fa32cfU, 0x959246eeU, ++ 0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU, ++ 0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU, ++ 0x06712339U, 0x522aca67U, 0x911bb605U, ++ 0x90a65f0eU, 0xf826ef7bU, 0x62512debU, ++ 0x57150ad7U, 0x5d473507U, 0x1ec47442U, ++ 0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U, ++ 0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU, ++ 0xe0f6c934U, 0xb0652033U, 0x9b9851ccU, ++ 0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU, ++ 0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU, ++ 0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U, ++ 0x65671619U, 0x9f5fff91U, 0xd89c5267U, ++ 0x007783ebU, 0x95766243U, 0xab639262U, ++ 0x9c7e1390U, 0xc368dda6U, 0x38ddc455U, ++ 0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU, ++ 0x2ee80657U, 0x33dbb66aU, 0xae3f0577U, ++ 0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U, ++ 0x87178304U ++}; ++#endif ++ + static int __init siphash_test_init(void) + { + u8 in[64] __aligned(SIPHASH_ALIGNMENT); +@@ -70,6 +130,16 @@ static int __init siphash_test_init(void + pr_info("siphash self-test unaligned %u: FAIL\n", i + 1); + ret = -EINVAL; + } ++ if (hsiphash(in, i, &test_key_hsiphash) != ++ test_vectors_hsiphash[i]) { ++ pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1); 
++ ret = -EINVAL; ++ } ++ if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) != ++ test_vectors_hsiphash[i]) { ++ pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1); ++ ret = -EINVAL; ++ } + } + if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) != + test_vectors_siphash[8]) { +@@ -115,6 +185,28 @@ static int __init siphash_test_init(void + pr_info("siphash self-test 4u32: FAIL\n"); + ret = -EINVAL; + } ++ if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) != ++ test_vectors_hsiphash[4]) { ++ pr_info("hsiphash self-test 1u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) != ++ test_vectors_hsiphash[8]) { ++ pr_info("hsiphash self-test 2u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (hsiphash_3u32(0x03020100U, 0x07060504U, ++ 0x0b0a0908U, &test_key_hsiphash) != ++ test_vectors_hsiphash[12]) { ++ pr_info("hsiphash self-test 3u32: FAIL\n"); ++ ret = -EINVAL; ++ } ++ if (hsiphash_4u32(0x03020100U, 0x07060504U, ++ 0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) != ++ test_vectors_hsiphash[16]) { ++ pr_info("hsiphash self-test 4u32: FAIL\n"); ++ ret = -EINVAL; ++ } + if (!ret) + pr_info("self-tests: pass\n"); + return ret; diff --git a/queue-4.9/vhost-introduce-vhost_exceeds_weight.patch b/queue-4.9/vhost-introduce-vhost_exceeds_weight.patch new file mode 100644 index 00000000000..4a0880d00a3 --- /dev/null +++ b/queue-4.9/vhost-introduce-vhost_exceeds_weight.patch @@ -0,0 +1,196 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:44 +0100 +Subject: vhost: introduce vhost_exceeds_weight() +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230044.GK9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason Wang + +commit e82b9b0727ff6d665fff2d326162b460dded554d upstream. + +We used to have vhost_exceeds_weight() for vhost-net to: + +- prevent vhost kthread from hogging the cpu +- balance the time spent between TX and RX + +This function could be useful for vsock and scsi as well. So move it +to vhost.c. Device must specify a weight which counts the number of +requests, or it can also specific a byte_weight which counts the +number of bytes that has been processed. + +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. Tsirkin +[bwh: Backported to 4.9: + - In vhost_net, both Tx modes are handled in one loop in handle_tx() + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 18 +++++------------- + drivers/vhost/scsi.c | 9 ++++++++- + drivers/vhost/vhost.c | 20 +++++++++++++++++++- + drivers/vhost/vhost.h | 6 +++++- + drivers/vhost/vsock.c | 12 +++++++++++- + 5 files changed, 48 insertions(+), 17 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -357,12 +357,6 @@ static int vhost_net_tx_get_vq_desc(stru + return r; + } + +-static bool vhost_exceeds_weight(int pkts, int total_len) +-{ +- return total_len >= VHOST_NET_WEIGHT || +- pkts >= VHOST_NET_PKT_WEIGHT; +-} +- + /* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. 
*/ + static void handle_tx(struct vhost_net *net) +@@ -487,10 +481,9 @@ static void handle_tx(struct vhost_net * + vhost_zerocopy_signal_used(net, vq); + total_len += len; + vhost_net_tx_packet(net); +- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); ++ if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts, ++ total_len))) + break; +- } + } + out: + mutex_unlock(&vq->mutex); +@@ -768,10 +761,8 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); ++ if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len))) + goto out; +- } + } + vhost_net_enable_vq(net, vq); + out: +@@ -842,7 +833,8 @@ static int vhost_net_open(struct inode * + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + } +- vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); ++ vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, ++ VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -58,6 +58,12 @@ + #define VHOST_SCSI_PREALLOC_UPAGES 2048 + #define VHOST_SCSI_PREALLOC_PROT_SGLS 512 + ++/* Max number of requests before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * request. ++ */ ++#define VHOST_SCSI_WEIGHT 256 ++ + struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; +@@ -1433,7 +1439,8 @@ static int vhost_scsi_open(struct inode + vqs[i] = &vs->vqs[i].vq; + vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } +- vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ); ++ vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, ++ VHOST_SCSI_WEIGHT, 0); + + vhost_scsi_init_inflight(vs, NULL); + +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -393,8 +393,24 @@ static void vhost_dev_free_iovecs(struct + vhost_vq_free_iovecs(dev->vqs[i]); + } + ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, ++ int pkts, int total_len) ++{ ++ struct vhost_dev *dev = vq->dev; ++ ++ if ((dev->byte_weight && total_len >= dev->byte_weight) || ++ pkts >= dev->weight) { ++ vhost_poll_queue(&vq->poll); ++ return true; ++ } ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(vhost_exceeds_weight); ++ + void vhost_dev_init(struct vhost_dev *dev, +- struct vhost_virtqueue **vqs, int nvqs) ++ struct vhost_virtqueue **vqs, int nvqs, ++ int weight, int byte_weight) + { + struct vhost_virtqueue *vq; + int i; +@@ -408,6 +424,8 @@ void vhost_dev_init(struct vhost_dev *de + dev->iotlb = NULL; + dev->mm = NULL; + dev->worker = NULL; ++ dev->weight = weight; ++ dev->byte_weight = byte_weight; + init_llist_head(&dev->work_list); + init_waitqueue_head(&dev->wait); + INIT_LIST_HEAD(&dev->read_list); +--- a/drivers/vhost/vhost.h ++++ b/drivers/vhost/vhost.h +@@ -164,9 +164,13 @@ struct vhost_dev { + struct list_head read_list; + struct list_head pending_list; + wait_queue_head_t wait; ++ int weight; ++ int byte_weight; + }; + +-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); ++void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, ++ int nvqs, int weight, int byte_weight); + long vhost_dev_set_owner(struct vhost_dev *dev); + bool 
vhost_dev_has_owner(struct vhost_dev *dev); + long vhost_dev_check_owner(struct vhost_dev *); +--- a/drivers/vhost/vsock.c ++++ b/drivers/vhost/vsock.c +@@ -21,6 +21,14 @@ + #include "vhost.h" + + #define VHOST_VSOCK_DEFAULT_HOST_CID 2 ++/* Max number of bytes transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others. */ ++#define VHOST_VSOCK_WEIGHT 0x80000 ++/* Max number of packets transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * small pkts. ++ */ ++#define VHOST_VSOCK_PKT_WEIGHT 256 + + enum { + VHOST_VSOCK_FEATURES = VHOST_FEATURES, +@@ -529,7 +537,9 @@ static int vhost_vsock_dev_open(struct i + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + +- vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs)); ++ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), ++ VHOST_VSOCK_PKT_WEIGHT, ++ VHOST_VSOCK_WEIGHT); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); diff --git a/queue-4.9/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch b/queue-4.9/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch new file mode 100644 index 00000000000..67b5e91b770 --- /dev/null +++ b/queue-4.9/vhost-net-set-packet-weight-of-tx-polling-to-2-vq-size.patch @@ -0,0 +1,139 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:19 +0100 +Subject: vhost-net: set packet weight of tx polling to 2 * vq size +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230019.GH9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: haibinzhang(张海斌) + +commit a2ac99905f1ea8b15997a6ec39af69aa28a3653b upstream. + +handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy +polling udp packets with small length(e.g. 1byte udp payload), because setting +VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length. + +Ping-Latencies shown below were tested between two Virtual Machines using +netperf (UDP_STREAM, len=1), and then another machine pinged the client: + +vq size=256 +Packet-Weight Ping-Latencies(millisecond) + min avg max +Origin 3.319 18.489 57.303 +64 1.643 2.021 2.552 +128 1.825 2.600 3.224 +256 1.997 2.710 4.295 +512 1.860 3.171 4.631 +1024 2.002 4.173 9.056 +2048 2.257 5.650 9.688 +4096 2.093 8.508 15.943 + +vq size=512 +Packet-Weight Ping-Latencies(millisecond) + min avg max +Origin 6.537 29.177 66.245 +64 2.798 3.614 4.403 +128 2.861 3.820 4.775 +256 3.008 4.018 4.807 +512 3.254 4.523 5.824 +1024 3.079 5.335 7.747 +2048 3.944 8.201 12.762 +4096 4.158 11.057 19.985 + +Seems pretty consistent, a small dip at 2 VQ sizes. +Ring size is a hint from device about a burst size it can tolerate. Based on +benchmarks, set the weight to 2 * vq size. + +To evaluate this change, another tests were done using netperf(RR, TX) between +two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was +tweaked through qemu. Results shown below does not show obvious changes. 
+ +vq size=256 TCP_RR vq size=512 TCP_RR +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2% + 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0% + 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1% + 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3% + 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1% + 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2% + 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2% + 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2% + 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1% + +vq size=256 UDP_RR vq size=512 UDP_RR +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2% + 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2% + 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0% + 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1% + 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0% + 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1% + 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0% + 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4% + 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1% + +vq size=256 TCP_STREAM vq size=512 TCP_STREAM +size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% + 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0% + 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4% + 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2% + 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1% + 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0% + 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0% + 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1% + 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0% + 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1% +1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1% +1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0% +1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0% +2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0% +2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1% +2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1% +4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0% +4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0% +4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1% + +Acked-by: Michael S. Tsirkin +Signed-off-by: Haibin Zhang +Signed-off-by: Yunfang Tai +Signed-off-by: Lidong Chen +Signed-off-by: David S. Miller +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -39,6 +39,10 @@ MODULE_PARM_DESC(experimental_zcopytx, " + * Using this limit prevents one virtqueue from starving others. */ + #define VHOST_NET_WEIGHT 0x80000 + ++/* Max number of packets transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving rx. 
*/ ++#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2) ++ + /* MAX number of TX used buffers for outstanding zerocopy */ + #define VHOST_MAX_PEND 128 + #define VHOST_GOODCOPY_LEN 256 +@@ -372,6 +376,7 @@ static void handle_tx(struct vhost_net * + struct socket *sock; + struct vhost_net_ubuf_ref *uninitialized_var(ubufs); + bool zcopy, zcopy_used; ++ int sent_pkts = 0; + + mutex_lock(&vq->mutex); + sock = vq->private_data; +@@ -474,7 +479,8 @@ static void handle_tx(struct vhost_net * + vhost_zerocopy_signal_used(net, vq); + total_len += len; + vhost_net_tx_packet(net); +- if (unlikely(total_len >= VHOST_NET_WEIGHT)) { ++ if (unlikely(total_len >= VHOST_NET_WEIGHT) || ++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) { + vhost_poll_queue(&vq->poll); + break; + } diff --git a/queue-4.9/vhost-scsi-add-weight-support.patch b/queue-4.9/vhost-scsi-add-weight-support.patch new file mode 100644 index 00000000000..6bc95e4b9aa --- /dev/null +++ b/queue-4.9/vhost-scsi-add-weight-support.patch @@ -0,0 +1,64 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:01:01 +0100 +Subject: vhost: scsi: add weight support +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230101.GM9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason Wang + +commit c1ea02f15ab5efb3e93fc3144d895410bf79fcf2 upstream. + +This patch will check the weight and exit the loop if we exceeds the +weight. This is useful for preventing scsi kthread from hogging cpu +which is guest triggerable. + +This addresses CVE-2019-3900. + +Cc: Paolo Bonzini +Cc: Stefan Hajnoczi +Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric driver") +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. 
Tsirkin +Reviewed-by: Stefan Hajnoczi +[bwh: Backported to 4.9: + - Drop changes in vhost_scsi_ctl_handle_vq() + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/scsi.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -851,7 +851,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + u64 tag; + u32 exp_data_len, data_direction; + unsigned out, in; +- int head, ret, prot_bytes; ++ int head, ret, prot_bytes, c = 0; + size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); + size_t out_size, in_size; + u16 lun; +@@ -870,7 +870,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + + vhost_disable_notify(&vs->dev, vq); + +- for (;;) { ++ do { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); +@@ -1086,7 +1086,7 @@ vhost_scsi_handle_vq(struct vhost_scsi * + */ + INIT_WORK(&cmd->work, vhost_scsi_submission_work); + queue_work(vhost_scsi_workqueue, &cmd->work); +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); + out: + mutex_unlock(&vq->mutex); + } diff --git a/queue-4.9/vhost_net-fix-possible-infinite-loop.patch b/queue-4.9/vhost_net-fix-possible-infinite-loop.patch new file mode 100644 index 00000000000..01393db9f55 --- /dev/null +++ b/queue-4.9/vhost_net-fix-possible-infinite-loop.patch @@ -0,0 +1,120 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:53 +0100 +Subject: vhost_net: fix possible infinite loop +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230053.GL9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason Wang + +commit e2412c07f8f3040593dfb88207865a3cd58680c0 upstream. + +When the rx buffer is too small for a packet, we will discard the vq +descriptor and retry it for the next packet: + +while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk, + &busyloop_intr))) { +... + /* On overrun, truncate and discard */ + if (unlikely(headcount > UIO_MAXIOV)) { + iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); + err = sock->ops->recvmsg(sock, &msg, + 1, MSG_DONTWAIT | MSG_TRUNC); + pr_debug("Discarded rx packet: len %zd\n", sock_len); + continue; + } +... +} + +This makes it possible to trigger a infinite while..continue loop +through the co-opreation of two VMs like: + +1) Malicious VM1 allocate 1 byte rx buffer and try to slow down the + vhost process as much as possible e.g using indirect descriptors or + other. +2) Malicious VM2 generate packets to VM1 as fast as possible + +Fixing this by checking against weight at the end of RX and TX +loop. This also eliminate other similar cases when: + +- userspace is consuming the packets in the meanwhile +- theoretical TOCTOU attack if guest moving avail index back and forth + to hit the continue after vhost find guest just add new buffers + +This addresses CVE-2019-3900. + +Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short") +Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server") +Signed-off-by: Jason Wang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael S. 
Tsirkin +[bwh: Backported to 4.9: + - Both Tx modes are handled in one loop in handle_tx() + - Adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -393,7 +393,7 @@ static void handle_tx(struct vhost_net * + hdr_size = nvq->vhost_hlen; + zcopy = nvq->ubufs; + +- for (;;) { ++ do { + /* Release DMAs done buffers first */ + if (zcopy) + vhost_zerocopy_signal_used(net, vq); +@@ -481,10 +481,7 @@ static void handle_tx(struct vhost_net * + vhost_zerocopy_signal_used(net, vq); + total_len += len; + vhost_net_tx_packet(net); +- if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts, +- total_len))) +- break; +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + out: + mutex_unlock(&vq->mutex); + } +@@ -682,7 +679,10 @@ static void handle_rx(struct vhost_net * + vq->log : NULL; + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + +- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) { ++ do { ++ sock_len = vhost_net_rx_peek_head_len(net, sock->sk); ++ if (!sock_len) ++ break; + sock_len += sock_hlen; + vhost_len = sock_len + vhost_hlen; + headcount = get_rx_bufs(vq, vq->heads, vhost_len, +@@ -761,10 +761,10 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len))) +- goto out; +- } +- vhost_net_enable_vq(net, vq); ++ } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); ++ ++ if (!sock_len) ++ vhost_net_enable_vq(net, vq); + out: + mutex_unlock(&vq->mutex); + } +@@ -834,7 +834,7 @@ static int vhost_net_open(struct inode * + n->vqs[i].sock_hlen = 0; + } + vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, +- VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT); ++ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); diff --git a/queue-4.9/vhost_net-introduce-vhost_exceeds_weight.patch b/queue-4.9/vhost_net-introduce-vhost_exceeds_weight.patch new file mode 100644 index 00000000000..5d1b3e92157 --- /dev/null +++ b/queue-4.9/vhost_net-introduce-vhost_exceeds_weight.patch @@ -0,0 +1,57 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:36 +0100 +Subject: vhost_net: introduce vhost_exceeds_weight() +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230036.GJ9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Jason Wang + +commit 272f35cba53d088085e5952fd81d7a133ab90789 upstream. + +Signed-off-by: Jason Wang +Signed-off-by: David S. Miller +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -357,6 +357,12 @@ static int vhost_net_tx_get_vq_desc(stru + return r; + } + ++static bool vhost_exceeds_weight(int pkts, int total_len) ++{ ++ return total_len >= VHOST_NET_WEIGHT || ++ pkts >= VHOST_NET_PKT_WEIGHT; ++} ++ + /* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. 
*/ + static void handle_tx(struct vhost_net *net) +@@ -481,8 +487,7 @@ static void handle_tx(struct vhost_net * + vhost_zerocopy_signal_used(net, vq); + total_len += len; + vhost_net_tx_packet(net); +- if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) { ++ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) { + vhost_poll_queue(&vq->poll); + break; + } +@@ -763,8 +768,7 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) { ++ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) { + vhost_poll_queue(&vq->poll); + goto out; + } diff --git a/queue-4.9/vhost_net-use-packet-weight-for-rx-handler-too.patch b/queue-4.9/vhost_net-use-packet-weight-for-rx-handler-too.patch new file mode 100644 index 00000000000..5ccd5234b61 --- /dev/null +++ b/queue-4.9/vhost_net-use-packet-weight-for-rx-handler-too.patch @@ -0,0 +1,95 @@ +From foo@baz Sat 17 Aug 2019 06:38:22 PM CEST +From: Ben Hutchings +Date: Sat, 17 Aug 2019 00:00:28 +0100 +Subject: vhost_net: use packet weight for rx handler, too +To: Greg Kroah-Hartman , Sasha Levin +Cc: stable +Message-ID: <20190816230028.GI9843@xylophone.i.decadent.org.uk> +Content-Disposition: inline + +From: Paolo Abeni + +commit db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 upstream. + +Similar to commit a2ac99905f1e ("vhost-net: set packet weight of +tx polling to 2 * vq size"), we need a packet-based limit for +handler_rx, too - elsewhere, under rx flood with small packets, +tx can be delayed for a very long time, even without busypolling. + +The pkt limit applied to handle_rx must be the same applied by +handle_tx, or we will get unfair scheduling between rx and tx. +Tying such limit to the queue length makes it less effective for +large queue length values and can introduce large process +scheduler latencies, so a constant valued is used - likewise +the existing bytes limit. + +The selected limit has been validated with PVP[1] performance +test with different queue sizes: + +queue size 256 512 1024 + +baseline 366 354 362 +weight 128 715 723 670 +weight 256 740 745 733 +weight 512 600 460 583 +weight 1024 423 427 418 + +A packet weight of 256 gives peek performances in under all the +tested scenarios. + +No measurable regression in unidirectional performance tests has +been detected. + +[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/ + +Signed-off-by: Paolo Abeni +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + drivers/vhost/net.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -40,8 +40,10 @@ MODULE_PARM_DESC(experimental_zcopytx, " + #define VHOST_NET_WEIGHT 0x80000 + + /* Max number of packets transferred before requeueing the job. +- * Using this limit prevents one virtqueue from starving rx. */ +-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2) ++ * Using this limit prevents one virtqueue from starving others with small ++ * pkts. 
++ */ ++#define VHOST_NET_PKT_WEIGHT 256 + + /* MAX number of TX used buffers for outstanding zerocopy */ + #define VHOST_MAX_PEND 128 +@@ -480,7 +482,7 @@ static void handle_tx(struct vhost_net * + total_len += len; + vhost_net_tx_packet(net); + if (unlikely(total_len >= VHOST_NET_WEIGHT) || +- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) { ++ unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } +@@ -662,6 +664,7 @@ static void handle_rx(struct vhost_net * + struct socket *sock; + struct iov_iter fixup; + __virtio16 num_buffers; ++ int recv_pkts = 0; + + mutex_lock_nested(&vq->mutex, 0); + sock = vq->private_data; +@@ -760,7 +763,8 @@ static void handle_rx(struct vhost_net * + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(total_len >= VHOST_NET_WEIGHT)) { ++ if (unlikely(total_len >= VHOST_NET_WEIGHT) || ++ unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) { + vhost_poll_queue(&vq->poll); + goto out; + }
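The vhost patches above all converge on the same service-loop shape: account for both packets and bytes processed, and once either count crosses the device's weight, requeue the virtqueue poll and return so other virtqueues and the rest of the system can run. The condensed sketch below shows that shape only; it is not the actual handle_tx()/handle_rx() code. process_one_request() is a placeholder standing in for the real descriptor handling, while vhost_exceeds_weight() is the helper introduced in "vhost: introduce vhost_exceeds_weight()" above.

/*
 * Condensed illustration of the weight-bounded vhost service loop.
 * process_one_request() is a placeholder; vhost_exceeds_weight()
 * mirrors the helper added above: it requeues vq->poll and returns
 * true once dev->weight packets or dev->byte_weight bytes have been
 * handled, bounding the time spent in a single worker invocation.
 */
static void handle_queue(struct vhost_virtqueue *vq)
{
	int pkts = 0, total_len = 0;

	do {
		int len = process_one_request(vq);	/* placeholder */

		if (len < 0)	/* no more work available right now */
			break;
		total_len += len;
	} while (likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
}

The loop ends either when no work remains or when the weight is exceeded; in the latter case the requeued poll ensures the remaining work is picked up on a later worker run rather than dropped.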