]>
Commit | Line | Data |
---|---|---|
aa6bb135 RS |
1 | /* |
2 | * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. | |
c8d1c9b0 | 3 | * |
aa6bb135 RS |
4 | * Licensed under the OpenSSL license (the "License"). You may not use |
5 | * this file except in compliance with the License. You can obtain a copy | |
6 | * in the file LICENSE in the source distribution or at | |
7 | * https://www.openssl.org/source/license.html | |
c8d1c9b0 AP |
8 | */ |
9 | ||
10 | /* | |
11 | * This module is meant to be used as template for non-x87 floating- | |
12 | * point assembly modules. The template itself is x86_64-specific | |
13 | * though, as it was debugged on x86_64. So that implementor would | |
14 | * have to recognize platform-specific parts, UxTOy and inline asm, | |
15 | * and act accordingly. | |
16 | * | |
17 | * Huh? x86_64-specific code as template for non-x87? Note seven, which | |
18 | * is not a typo, but reference to 80-bit precision. This module on the | |
19 | * other hand relies on 64-bit precision operations, which are default | |
20 | * for x86_64 code. And since we are at it, just for sense of it, | |
21 | * large-block performance in cycles per processed byte for *this* code | |
22 | * is: | |
23 | * gcc-4.8 icc-15.0 clang-3.4(*) | |
24 | * | |
25 | * Westmere 4.96 5.09 4.37 | |
26 | * Sandy Bridge 4.95 4.90 4.17 | |
27 | * Haswell 4.92 4.87 3.78 | |
28 | * Bulldozer 4.67 4.49 4.68 | |
29 | * VIA Nano 7.07 7.05 5.98 | |
30 | * Silvermont 10.6 9.61 12.6 | |
31 | * | |
32 | * (*) clang managed to discover parallelism and deployed SIMD; | |
33 | * | |
34 | * And for range of other platforms with unspecified gcc versions: | |
35 | * | |
36 | * Freescale e300 12.5 | |
37 | * PPC74x0 10.8 | |
38 | * POWER6 4.92 | |
39 | * POWER7 4.50 | |
40 | * POWER8 4.10 | |
41 | * | |
42 | * z10 11.2 | |
43 | * z196+ 7.30 | |
44 | * | |
45 | * UltraSPARC III 16.0 | |
46 | * SPARC T4 16.1 | |
47 | */ | |
48 | ||
49 | #if !(defined(__GNUC__) && __GNUC__>=2) | |
50 | # error "this is gcc-specific template" | |
51 | #endif | |
52 | ||
53 | #include <stdlib.h> | |
54 | ||
55 | typedef unsigned char u8; | |
56 | typedef unsigned int u32; | |
57 | typedef unsigned long long u64; | |
58 | typedef union { double d; u64 u; } elem64; | |
59 | ||
60 | #define TWO(p) ((double)(1ULL<<(p))) | |
61 | #define TWO0 TWO(0) | |
62 | #define TWO32 TWO(32) | |
63 | #define TWO64 (TWO32*TWO(32)) | |
64 | #define TWO96 (TWO64*TWO(32)) | |
65 | #define TWO130 (TWO96*TWO(34)) | |
66 | ||
67 | #define EXP(p) ((1023ULL+(p))<<52) | |
68 | ||
69 | #if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__)) | |
70 | # define U8TOU32(p) (*(const u32 *)(p)) | |
71 | # define U32TO8(p,v) (*(u32 *)(p) = (v)) | |
72 | #elif defined(__PPC__) | |
73 | # define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; }) | |
74 | # define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory") | |
75 | #elif defined(__s390x__) | |
76 | # define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; }) | |
77 | # define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v)) | |
78 | #endif | |
79 | ||
80 | #ifndef U8TOU32 | |
81 | # define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \ | |
82 | (u32)(p)[2]<<16 | (u32)(p)[3]<<24 ) | |
83 | #endif | |
84 | #ifndef U32TO8 | |
85 | # define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \ | |
86 | (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) ) | |
87 | #endif | |
88 | ||
89 | typedef struct { | |
90 | elem64 h[4]; | |
91 | double r[8]; | |
92 | double s[6]; | |
93 | } poly1305_internal; | |
94 | ||
95 | /* "round toward zero (truncate), mask all exceptions" */ | |
96 | #if defined(__x86_64__) | |
97 | static const u32 mxcsr = 0x7f80; | |
98 | #elif defined(__PPC__) | |
99 | static const u64 one = 1; | |
100 | #elif defined(__s390x__) | |
101 | static const u32 fpc = 1; | |
102 | #elif defined(__sparc__) | |
103 | static const u64 fsr = 1ULL<<30; | |
104 | #else | |
105 | #error "unrecognized platform" | |
106 | #endif | |
107 | ||
108 | int poly1305_init(void *ctx, const unsigned char key[16]) | |
109 | { | |
110 | poly1305_internal *st = (poly1305_internal *) ctx; | |
111 | elem64 r0, r1, r2, r3; | |
112 | ||
113 | /* h = 0, biased */ | |
114 | #if 0 | |
115 | st->h[0].d = TWO(52)*TWO0; | |
116 | st->h[1].d = TWO(52)*TWO32; | |
117 | st->h[2].d = TWO(52)*TWO64; | |
118 | st->h[3].d = TWO(52)*TWO96; | |
119 | #else | |
120 | st->h[0].u = EXP(52+0); | |
121 | st->h[1].u = EXP(52+32); | |
122 | st->h[2].u = EXP(52+64); | |
123 | st->h[3].u = EXP(52+96); | |
124 | #endif | |
125 | ||
126 | if (key) { | |
127 | /* | |
128 | * set "truncate" rounding mode | |
129 | */ | |
130 | #if defined(__x86_64__) | |
131 | u32 mxcsr_orig; | |
132 | ||
133 | asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); | |
134 | asm volatile ("ldmxcsr %0"::"m"(mxcsr)); | |
135 | #elif defined(__PPC__) | |
136 | double fpscr_orig, fpscr = *(double *)&one; | |
137 | ||
138 | asm volatile ("mffs %0":"=f"(fpscr_orig)); | |
139 | asm volatile ("mtfsf 255,%0"::"f"(fpscr)); | |
140 | #elif defined(__s390x__) | |
141 | u32 fpc_orig; | |
142 | ||
143 | asm volatile ("stfpc %0":"=m"(fpc_orig)); | |
144 | asm volatile ("lfpc %0"::"m"(fpc)); | |
145 | #elif defined(__sparc__) | |
146 | u64 fsr_orig; | |
147 | ||
148 | asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); | |
149 | asm volatile ("ldx %0,%%fsr"::"m"(fsr)); | |
150 | #endif | |
151 | ||
152 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |
153 | r0.u = EXP(52+0) | (U8TOU32(&key[0]) & 0x0fffffff); | |
154 | r1.u = EXP(52+32) | (U8TOU32(&key[4]) & 0x0ffffffc); | |
155 | r2.u = EXP(52+64) | (U8TOU32(&key[8]) & 0x0ffffffc); | |
156 | r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc); | |
157 | ||
158 | st->r[0] = r0.d - TWO(52)*TWO0; | |
159 | st->r[2] = r1.d - TWO(52)*TWO32; | |
160 | st->r[4] = r2.d - TWO(52)*TWO64; | |
161 | st->r[6] = r3.d - TWO(52)*TWO96; | |
162 | ||
163 | st->s[0] = st->r[2] * (5.0/TWO130); | |
164 | st->s[2] = st->r[4] * (5.0/TWO130); | |
165 | st->s[4] = st->r[6] * (5.0/TWO130); | |
166 | ||
167 | /* | |
168 | * base 2^32 -> base 2^16 | |
169 | */ | |
170 | st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) - | |
171 | TWO(52)*TWO(16)*TWO0; | |
172 | st->r[0] -= st->r[1]; | |
173 | ||
174 | st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) - | |
175 | TWO(52)*TWO(16)*TWO32; | |
176 | st->r[2] -= st->r[3]; | |
177 | ||
178 | st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) - | |
179 | TWO(52)*TWO(16)*TWO64; | |
180 | st->r[4] -= st->r[5]; | |
181 | ||
182 | st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) - | |
183 | TWO(52)*TWO(16)*TWO96; | |
184 | st->r[6] -= st->r[7]; | |
185 | ||
186 | st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) - | |
187 | TWO(52)*TWO(16)*TWO0/TWO96; | |
188 | st->s[0] -= st->s[1]; | |
189 | ||
190 | st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) - | |
191 | TWO(52)*TWO(16)*TWO32/TWO96; | |
192 | st->s[2] -= st->s[3]; | |
193 | ||
194 | st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) - | |
195 | TWO(52)*TWO(16)*TWO64/TWO96; | |
196 | st->s[4] -= st->s[5]; | |
197 | ||
198 | /* | |
199 | * restore original FPU control register | |
200 | */ | |
201 | #if defined(__x86_64__) | |
202 | asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); | |
203 | #elif defined(__PPC__) | |
204 | asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); | |
205 | #elif defined(__s390x__) | |
206 | asm volatile ("lfpc %0"::"m"(fpc_orig)); | |
207 | #elif defined(__sparc__) | |
208 | asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); | |
209 | #endif | |
210 | } | |
211 | ||
212 | return 0; | |
213 | } | |
214 | ||
215 | void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, | |
216 | int padbit) | |
217 | { | |
218 | poly1305_internal *st = (poly1305_internal *)ctx; | |
219 | elem64 in0, in1, in2, in3; | |
220 | u64 pad = (u64)padbit<<32; | |
221 | ||
222 | double x0, x1, x2, x3; | |
223 | double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi; | |
224 | double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi; | |
225 | ||
226 | const double r0lo = st->r[0]; | |
227 | const double r0hi = st->r[1]; | |
228 | const double r1lo = st->r[2]; | |
229 | const double r1hi = st->r[3]; | |
230 | const double r2lo = st->r[4]; | |
231 | const double r2hi = st->r[5]; | |
232 | const double r3lo = st->r[6]; | |
233 | const double r3hi = st->r[7]; | |
234 | ||
235 | const double s1lo = st->s[0]; | |
236 | const double s1hi = st->s[1]; | |
237 | const double s2lo = st->s[2]; | |
238 | const double s2hi = st->s[3]; | |
239 | const double s3lo = st->s[4]; | |
240 | const double s3hi = st->s[5]; | |
241 | ||
242 | /* | |
243 | * set "truncate" rounding mode | |
244 | */ | |
245 | #if defined(__x86_64__) | |
246 | u32 mxcsr_orig; | |
247 | ||
248 | asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); | |
249 | asm volatile ("ldmxcsr %0"::"m"(mxcsr)); | |
250 | #elif defined(__PPC__) | |
251 | double fpscr_orig, fpscr = *(double *)&one; | |
252 | ||
253 | asm volatile ("mffs %0":"=f"(fpscr_orig)); | |
254 | asm volatile ("mtfsf 255,%0"::"f"(fpscr)); | |
255 | #elif defined(__s390x__) | |
256 | u32 fpc_orig; | |
257 | ||
258 | asm volatile ("stfpc %0":"=m"(fpc_orig)); | |
259 | asm volatile ("lfpc %0"::"m"(fpc)); | |
260 | #elif defined(__sparc__) | |
261 | u64 fsr_orig; | |
262 | ||
263 | asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); | |
264 | asm volatile ("ldx %0,%%fsr"::"m"(fsr)); | |
265 | #endif | |
266 | ||
267 | /* | |
268 | * load base 2^32 and de-bias | |
269 | */ | |
270 | h0lo = st->h[0].d - TWO(52)*TWO0; | |
271 | h1lo = st->h[1].d - TWO(52)*TWO32; | |
272 | h2lo = st->h[2].d - TWO(52)*TWO64; | |
273 | h3lo = st->h[3].d - TWO(52)*TWO96; | |
274 | ||
275 | #ifdef __clang__ | |
276 | h0hi = 0; | |
277 | h1hi = 0; | |
278 | h2hi = 0; | |
279 | h3hi = 0; | |
280 | #else | |
281 | in0.u = EXP(52+0) | U8TOU32(&inp[0]); | |
282 | in1.u = EXP(52+32) | U8TOU32(&inp[4]); | |
283 | in2.u = EXP(52+64) | U8TOU32(&inp[8]); | |
284 | in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; | |
285 | ||
286 | x0 = in0.d - TWO(52)*TWO0; | |
287 | x1 = in1.d - TWO(52)*TWO32; | |
288 | x2 = in2.d - TWO(52)*TWO64; | |
289 | x3 = in3.d - TWO(52)*TWO96; | |
290 | ||
291 | x0 += h0lo; | |
292 | x1 += h1lo; | |
293 | x2 += h2lo; | |
294 | x3 += h3lo; | |
295 | ||
296 | goto fast_entry; | |
297 | #endif | |
298 | ||
299 | do { | |
300 | in0.u = EXP(52+0) | U8TOU32(&inp[0]); | |
301 | in1.u = EXP(52+32) | U8TOU32(&inp[4]); | |
302 | in2.u = EXP(52+64) | U8TOU32(&inp[8]); | |
303 | in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; | |
304 | ||
305 | x0 = in0.d - TWO(52)*TWO0; | |
306 | x1 = in1.d - TWO(52)*TWO32; | |
307 | x2 = in2.d - TWO(52)*TWO64; | |
308 | x3 = in3.d - TWO(52)*TWO96; | |
309 | ||
310 | /* | |
311 | * note that there are multiple ways to accumulate input, e.g. | |
312 | * one can as well accumulate to h0lo-h1lo-h1hi-h2hi... | |
313 | */ | |
314 | h0lo += x0; | |
315 | h0hi += x1; | |
316 | h2lo += x2; | |
317 | h2hi += x3; | |
318 | ||
319 | /* | |
320 | * carries that cross 32n-bit (and 130-bit) boundaries | |
321 | */ | |
322 | c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; | |
323 | c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; | |
324 | c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; | |
325 | c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; | |
326 | ||
327 | c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; | |
328 | c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; | |
329 | c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; | |
330 | c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; | |
331 | ||
332 | /* | |
333 | * base 2^48 -> base 2^32 with last reduction step | |
334 | */ | |
335 | x1 = (h1lo - c1lo) + c0lo; | |
336 | x2 = (h2lo - c2lo) + c1lo; | |
337 | x3 = (h3lo - c3lo) + c2lo; | |
338 | x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); | |
339 | ||
340 | x1 += (h1hi - c1hi) + c0hi; | |
341 | x2 += (h2hi - c2hi) + c1hi; | |
342 | x3 += (h3hi - c3hi) + c2hi; | |
343 | x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); | |
344 | ||
345 | #ifndef __clang__ | |
346 | fast_entry: | |
347 | #endif | |
348 | /* | |
349 | * base 2^32 * base 2^16 = base 2^48 | |
350 | */ | |
351 | h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0; | |
352 | h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0; | |
353 | h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0; | |
354 | h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0; | |
355 | ||
356 | h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0; | |
357 | h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0; | |
358 | h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0; | |
359 | h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0; | |
360 | ||
361 | inp += 16; | |
362 | len -= 16; | |
363 | ||
364 | } while (len >= 16); | |
365 | ||
366 | /* | |
367 | * carries that cross 32n-bit (and 130-bit) boundaries | |
368 | */ | |
369 | c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; | |
370 | c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; | |
371 | c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; | |
372 | c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; | |
373 | ||
374 | c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; | |
375 | c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; | |
376 | c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; | |
377 | c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; | |
378 | ||
379 | /* | |
380 | * base 2^48 -> base 2^32 with last reduction step | |
381 | */ | |
382 | x1 = (h1lo - c1lo) + c0lo; | |
383 | x2 = (h2lo - c2lo) + c1lo; | |
384 | x3 = (h3lo - c3lo) + c2lo; | |
385 | x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); | |
386 | ||
387 | x1 += (h1hi - c1hi) + c0hi; | |
388 | x2 += (h2hi - c2hi) + c1hi; | |
389 | x3 += (h3hi - c3hi) + c2hi; | |
390 | x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); | |
391 | ||
392 | /* | |
393 | * store base 2^32, with bias | |
394 | */ | |
395 | st->h[1].d = x1 + TWO(52)*TWO32; | |
396 | st->h[2].d = x2 + TWO(52)*TWO64; | |
397 | st->h[3].d = x3 + TWO(52)*TWO96; | |
398 | st->h[0].d = x0 + TWO(52)*TWO0; | |
399 | ||
400 | /* | |
401 | * restore original FPU control register | |
402 | */ | |
403 | #if defined(__x86_64__) | |
404 | asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); | |
405 | #elif defined(__PPC__) | |
406 | asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); | |
407 | #elif defined(__s390x__) | |
408 | asm volatile ("lfpc %0"::"m"(fpc_orig)); | |
409 | #elif defined(__sparc__) | |
410 | asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); | |
411 | #endif | |
412 | } | |
413 | ||
414 | void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) | |
415 | { | |
416 | poly1305_internal *st = (poly1305_internal *) ctx; | |
417 | u64 h0, h1, h2, h3, h4; | |
418 | u32 g0, g1, g2, g3, g4; | |
419 | u64 t; | |
420 | u32 mask; | |
421 | ||
422 | /* | |
423 | * thanks to bias masking exponent gives integer result | |
424 | */ | |
425 | h0 = st->h[0].u & 0x000fffffffffffffULL; | |
426 | h1 = st->h[1].u & 0x000fffffffffffffULL; | |
427 | h2 = st->h[2].u & 0x000fffffffffffffULL; | |
428 | h3 = st->h[3].u & 0x000fffffffffffffULL; | |
429 | ||
430 | /* | |
431 | * can be partially reduced, so reduce... | |
432 | */ | |
433 | h4 = h3>>32; h3 &= 0xffffffffU; | |
434 | g4 = h4&-4; | |
435 | h4 &= 3; | |
436 | g4 += g4>>2; | |
437 | ||
438 | h0 += g4; | |
439 | h1 += h0>>32; h0 &= 0xffffffffU; | |
440 | h2 += h1>>32; h1 &= 0xffffffffU; | |
441 | h3 += h2>>32; h2 &= 0xffffffffU; | |
442 | ||
443 | /* compute h + -p */ | |
444 | g0 = (u32)(t = h0 + 5); | |
445 | g1 = (u32)(t = h1 + (t >> 32)); | |
446 | g2 = (u32)(t = h2 + (t >> 32)); | |
447 | g3 = (u32)(t = h3 + (t >> 32)); | |
448 | g4 = h4 + (u32)(t >> 32); | |
449 | ||
450 | /* if there was carry, select g0-g3 */ | |
451 | mask = 0 - (g4 >> 2); | |
452 | g0 &= mask; | |
453 | g1 &= mask; | |
454 | g2 &= mask; | |
455 | g3 &= mask; | |
456 | mask = ~mask; | |
457 | g0 |= (h0 & mask); | |
458 | g1 |= (h1 & mask); | |
459 | g2 |= (h2 & mask); | |
460 | g3 |= (h3 & mask); | |
461 | ||
462 | /* mac = (h + nonce) % (2^128) */ | |
463 | g0 = (u32)(t = (u64)g0 + nonce[0]); | |
464 | g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]); | |
465 | g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]); | |
466 | g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]); | |
467 | ||
468 | U32TO8(mac + 0, g0); | |
469 | U32TO8(mac + 4, g1); | |
470 | U32TO8(mac + 8, g2); | |
471 | U32TO8(mac + 12, g3); | |
472 | } |