From 678cce4019d746da6c680c48ba9e6d417803e127 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 31 Mar 2019 13:04:11 -0700
Subject: crypto: x86/poly1305 - fix overflow during partial reduction

From: Eric Biggers <ebiggers@google.com>

commit 678cce4019d746da6c680c48ba9e6d417803e127 upstream.

The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer.  This is true for poly1305-generic
which processes only one block at a time.  However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.

Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit.  Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.

To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time.  I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code.  It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.

This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation.  But also add a
test vector which reproduces it directly (in the AVX2 case).

Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/x86/crypto/poly1305-avx2-x86_64.S |   14 +++++++---
 arch/x86/crypto/poly1305-sse2-x86_64.S |   22 ++++++++++------
 crypto/testmgr.h                       |   44 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 13 deletions(-)

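For illustration only, here is a minimal standalone C sketch of the
truncation described above.  The names d4 and h0 merely mirror the
assembly's 64-bit intermediate limb and accumulator; the value of d4 is
hypothetical, chosen only so that the carry (d4 >> 26) * 5 no longer
fits in 32 bits (which, per the comment added below, cannot happen in a
single-block implementation but can when 4 blocks are accumulated).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical oversized intermediate limb, as can arise when
	 * several blocks are accumulated per pass. */
	uint64_t d4 = 1ULL << 56;
	uint64_t h0 = 0x3ffffff;	/* a 26-bit limb value */

	/* Old code path: lea (%eax,%eax,4),%eax; add %eax,%ebx
	 * -- the carry is computed and added in 32-bit registers. */
	uint32_t carry32 = (uint32_t)(d4 >> 26) * 5;
	uint32_t h0_bad = (uint32_t)h0 + carry32;

	/* Fixed code path: lea (%rax,%rax,4),%rax; add %rax,%rbx
	 * -- the same carry kept in 64-bit registers. */
	uint64_t carry64 = (d4 >> 26) * 5;
	uint64_t h0_good = h0 + carry64;

	printf("32-bit carry: h0 = %#x\n", (unsigned int)h0_bad);
	printf("64-bit carry: h0 = %#llx\n", (unsigned long long)h0_good);
	return 0;
}

With this input the 32-bit path silently drops the upper bits of the
carry while the 64-bit path keeps them, which is the class of overflow
the new 300-byte test vector at the end of the patch exercises.
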
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
 	vpaddq		t2,t1,t1
 	vmovq		t1x,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount. Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers. It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
@@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2)
 	paddq		t2,t1
 	movq		t1,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount. Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers. It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -5706,7 +5706,49 @@ static const struct hash_testvec poly130
 		.psize	= 80,
 		.digest	= "\x13\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-	},
+	}, { /* Regression test for overflow in AVX2 implementation */
+		.plaintext	= "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff",
+		.psize	= 300,
+		.digest	= "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+			  "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
+	}
 };
 
 /* NHPoly1305 test vectors from https://github.com/google/adiantum */