From 678cce4019d746da6c680c48ba9e6d417803e127 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 31 Mar 2019 13:04:11 -0700
Subject: crypto: x86/poly1305 - fix overflow during partial reduction

From: Eric Biggers <ebiggers@google.com>

commit 678cce4019d746da6c680c48ba9e6d417803e127 upstream.

The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer.  This is true for poly1305-generic
which processes only one block at a time.  However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.

Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit.  Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.

To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time.  I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code.  It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.

This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation.  But also add a
test vector which reproduces it directly (in the AVX2 case).

Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/x86/crypto/poly1305-avx2-x86_64.S |   14 +++++++---
 arch/x86/crypto/poly1305-sse2-x86_64.S |   22 ++++++++++------
 crypto/testmgr.h                       |   44 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 13 deletions(-)

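For illustration only, here is a minimal standalone C sketch of the
truncation described above.  The names d4 and h0 merely mirror the
assembly's 64-bit intermediate limb and accumulator; the value of d4 is
hypothetical, chosen only so that the carry (d4 >> 26) * 5 no longer
fits in 32 bits (which, per the comment added below, cannot happen in a
single-block implementation but can when 4 blocks are accumulated).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical oversized intermediate limb, as can arise when
	 * several blocks are accumulated per pass. */
	uint64_t d4 = 1ULL << 56;
	uint64_t h0 = 0x3ffffff;	/* a 26-bit limb value */

	/* Old code path: lea (%eax,%eax,4),%eax; add %eax,%ebx
	 * -- the carry is computed and added in 32-bit registers. */
	uint32_t carry32 = (uint32_t)(d4 >> 26) * 5;
	uint32_t h0_bad = (uint32_t)h0 + carry32;

	/* Fixed code path: lea (%rax,%rax,4),%rax; add %rax,%rbx
	 * -- the same carry kept in 64-bit registers. */
	uint64_t carry64 = (d4 >> 26) * 5;
	uint64_t h0_good = h0 + carry64;

	printf("32-bit carry: h0 = %#x\n", (unsigned int)h0_bad);
	printf("64-bit carry: h0 = %#llx\n", (unsigned long long)h0_good);
	return 0;
}

With this input the 32-bit path silently drops the upper bits of the
carry while the 64-bit path keeps them, which is the class of overflow
the new 300-byte test vector at the end of the patch exercises.
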
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
 	vpaddq		t2,t1,t1
 	vmovq		t1x,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount. Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers. It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
@@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2)
 	paddq		t2,t1
 	movq		t1,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount. Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers. It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -5706,7 +5706,49 @@ static const struct hash_testvec poly130
 		.psize	= 80,
 		.digest	= "\x13\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
-	},
+	}, { /* Regression test for overflow in AVX2 implementation */
+		.plaintext	= "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff",
+		.psize	= 300,
+		.digest	= "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+			  "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
+	}
 };
 
 /* NHPoly1305 test vectors from https://github.com/google/adiantum */