From 678cce4019d746da6c680c48ba9e6d417803e127 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 31 Mar 2019 13:04:11 -0700
Subject: crypto: x86/poly1305 - fix overflow during partial reduction

From: Eric Biggers <ebiggers@google.com>

commit 678cce4019d746da6c680c48ba9e6d417803e127 upstream.

The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic,
which processes only one block at a time. However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.
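
As a rough, back-of-the-envelope illustration (these bounds are
approximate and are not part of the patch itself):

    1 block:  the h[i] and r[i] limbs are ~26 bits, so d4 is a sum of
              five ~52-bit products (< ~2^55), and (d4 >> 26) * 5 still
              fits in 32 bits.
    4 blocks: the unreduced limbs can be ~4x larger (~28 bits), so d4
              can approach ~2^57 and (d4 >> 26) * 5 can exceed 2^32,
              overflowing a 32-bit register.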

Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit. Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.
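
For illustration only, here is a minimal C sketch of the affected carry
step done with 64-bit arithmetic (the function and variable names are
hypothetical; the real fix is in the assembly below):

    #include <stdint.h>

    /*
     * Sketch of the d4 -> h0 carry in a 26-bit-limb Poly1305 partial
     * reduction.  d4 is an unreduced 64-bit limb.  The carry and the
     * multiply by 5 must stay in 64-bit variables, because with 4
     * blocks accumulated the result can exceed 32 bits.
     */
    static inline void carry_d4_to_h0(uint64_t d4, uint64_t *h4,
                                      uint64_t *h0)
    {
            uint64_t c = d4 >> 26;  /* carry out of limb 4 */

            *h4 = d4 & 0x3ffffff;   /* keep the low 26 bits */
            *h0 += c * 5;           /* 2^130 == 5 (mod 2^130 - 5) */
    }

This is the step the hunks below widen from 32-bit (%eax) to 64-bit
(%rax) registers.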

To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time. I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code. It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.

This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation. But this patch
also adds a test vector which reproduces it directly (in the AVX2 case).

Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/crypto/poly1305-avx2-x86_64.S |   14 +++++++---
 arch/x86/crypto/poly1305-sse2-x86_64.S |   22 ++++++++++------
 crypto/testmgr.h                       |   44 ++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 13 deletions(-)

--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
 	# h0 += (d4 >> 26) * 5
-	lea (%eax,%eax,4),%eax
+	lea (%rax,%rax,4),%rax

--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
 	# h0 += (d4 >> 26) * 5
-	lea (%eax,%eax,4),%eax
+	lea (%rax,%rax,4),%rax
 	# h0 = h0 & 0x3ffffff
@@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2)
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
@@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2)
 	# h0 += (d4 >> 26) * 5
-	lea (%eax,%eax,4),%eax
+	lea (%rax,%rax,4),%rax
 	# h4 = d4 & 0x3ffffff
 	# h0 = h0 & 0x3ffffff

--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -5592,7 +5592,49 @@ static const struct hash_testvec poly130
 		.digest	= "\x13\x00\x00\x00\x00\x00\x00\x00"
			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+	}, { /* Regression test for overflow in AVX2 implementation */
+		.plaintext = "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff\xff\xff\xff\xff"
+			     "\xff\xff\xff\xff",
+		.digest = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+			  "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",