]>
git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/poly1305/poly1305_ieee754.c
2 * Copyright 2016-20018 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
11 * This module is meant to be used as template for non-x87 floating-
12 * point assembly modules. The template itself is x86_64-specific
13 * though, as it was debugged on x86_64. So that implementor would
14 * have to recognize platform-specific parts, UxTOy and inline asm,
15 * and act accordingly.
17 * Huh? x86_64-specific code as template for non-x87? Note seven, which
18 * is not a typo, but reference to 80-bit precision. This module on the
19 * other hand relies on 64-bit precision operations, which are default
20 * for x86_64 code. And since we are at it, just for sense of it,
21 * large-block performance in cycles per processed byte for *this* code
23 * gcc-4.8 icc-15.0 clang-3.4(*)
25 * Westmere 4.96 5.09 4.37
26 * Sandy Bridge 4.95 4.90 4.17
27 * Haswell 4.92 4.87 3.78
28 * Bulldozer 4.67 4.49 4.68
29 * VIA Nano 7.07 7.05 5.98
30 * Silvermont 10.6 9.61 12.6
32 * (*) clang managed to discover parallelism and deployed SIMD;
34 * And for range of other platforms with unspecified gcc versions:
50 #if !(defined(__GNUC__) && __GNUC__>=2)
51 # error "this is gcc-specific template"
56 typedef unsigned char u8
;
57 typedef unsigned int u32
;
58 typedef unsigned long long u64
;
59 typedef union { double d
; u64 u
; } elem64
;
61 #define TWO(p) ((double)(1ULL<<(p)))
64 #define TWO64 (TWO32*TWO(32))
65 #define TWO96 (TWO64*TWO(32))
66 #define TWO130 (TWO96*TWO(34))
68 #define EXP(p) ((1023ULL+(p))<<52)
70 #if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__))
71 # define U8TOU32(p) (*(const u32 *)(p))
72 # define U32TO8(p,v) (*(u32 *)(p) = (v))
73 #elif defined(__PPC__)
74 # define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; })
75 # define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory")
76 #elif defined(__s390x__)
77 # define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; })
78 # define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v))
82 # define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \
83 (u32)(p)[2]<<16 | (u32)(p)[3]<<24 )
86 # define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \
87 (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) )
96 /* "round toward zero (truncate), mask all exceptions" */
97 #if defined(__x86_64__)
98 static const u32 mxcsr
= 0x7f80;
99 #elif defined(__PPC__)
100 static const u64 one
= 1;
101 #elif defined(__s390x__)
102 static const u32 fpc
= 1;
103 #elif defined(__sparc__)
104 static const u64 fsr
= 1ULL<<30;
105 #elif defined(__mips__)
106 static const u32 fcsr
= 1;
108 #error "unrecognized platform"
111 int poly1305_init(void *ctx
, const unsigned char key
[16])
113 poly1305_internal
*st
= (poly1305_internal
*) ctx
;
114 elem64 r0
, r1
, r2
, r3
;
118 st
->h
[0].d
= TWO(52)*TWO0
;
119 st
->h
[1].d
= TWO(52)*TWO32
;
120 st
->h
[2].d
= TWO(52)*TWO64
;
121 st
->h
[3].d
= TWO(52)*TWO96
;
123 st
->h
[0].u
= EXP(52+0);
124 st
->h
[1].u
= EXP(52+32);
125 st
->h
[2].u
= EXP(52+64);
126 st
->h
[3].u
= EXP(52+96);
131 * set "truncate" rounding mode
133 #if defined(__x86_64__)
136 asm volatile ("stmxcsr %0":"=m"(mxcsr_orig
));
137 asm volatile ("ldmxcsr %0"::"m"(mxcsr
));
138 #elif defined(__PPC__)
139 double fpscr_orig
, fpscr
= *(double *)&one
;
141 asm volatile ("mffs %0":"=f"(fpscr_orig
));
142 asm volatile ("mtfsf 255,%0"::"f"(fpscr
));
143 #elif defined(__s390x__)
146 asm volatile ("stfpc %0":"=m"(fpc_orig
));
147 asm volatile ("lfpc %0"::"m"(fpc
));
148 #elif defined(__sparc__)
151 asm volatile ("stx %%fsr,%0":"=m"(fsr_orig
));
152 asm volatile ("ldx %0,%%fsr"::"m"(fsr
));
153 #elif defined(__mips__)
156 asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig
));
157 asm volatile ("ctc1 %0,$31"::"r"(fcsr
));
160 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
161 r0
.u
= EXP(52+0) | (U8TOU32(&key
[0]) & 0x0fffffff);
162 r1
.u
= EXP(52+32) | (U8TOU32(&key
[4]) & 0x0ffffffc);
163 r2
.u
= EXP(52+64) | (U8TOU32(&key
[8]) & 0x0ffffffc);
164 r3
.u
= EXP(52+96) | (U8TOU32(&key
[12]) & 0x0ffffffc);
166 st
->r
[0] = r0
.d
- TWO(52)*TWO0
;
167 st
->r
[2] = r1
.d
- TWO(52)*TWO32
;
168 st
->r
[4] = r2
.d
- TWO(52)*TWO64
;
169 st
->r
[6] = r3
.d
- TWO(52)*TWO96
;
171 st
->s
[0] = st
->r
[2] * (5.0/TWO130
);
172 st
->s
[2] = st
->r
[4] * (5.0/TWO130
);
173 st
->s
[4] = st
->r
[6] * (5.0/TWO130
);
176 * base 2^32 -> base 2^16
178 st
->r
[1] = (st
->r
[0] + TWO(52)*TWO(16)*TWO0
) -
179 TWO(52)*TWO(16)*TWO0
;
180 st
->r
[0] -= st
->r
[1];
182 st
->r
[3] = (st
->r
[2] + TWO(52)*TWO(16)*TWO32
) -
183 TWO(52)*TWO(16)*TWO32
;
184 st
->r
[2] -= st
->r
[3];
186 st
->r
[5] = (st
->r
[4] + TWO(52)*TWO(16)*TWO64
) -
187 TWO(52)*TWO(16)*TWO64
;
188 st
->r
[4] -= st
->r
[5];
190 st
->r
[7] = (st
->r
[6] + TWO(52)*TWO(16)*TWO96
) -
191 TWO(52)*TWO(16)*TWO96
;
192 st
->r
[6] -= st
->r
[7];
194 st
->s
[1] = (st
->s
[0] + TWO(52)*TWO(16)*TWO0
/TWO96
) -
195 TWO(52)*TWO(16)*TWO0
/TWO96
;
196 st
->s
[0] -= st
->s
[1];
198 st
->s
[3] = (st
->s
[2] + TWO(52)*TWO(16)*TWO32
/TWO96
) -
199 TWO(52)*TWO(16)*TWO32
/TWO96
;
200 st
->s
[2] -= st
->s
[3];
202 st
->s
[5] = (st
->s
[4] + TWO(52)*TWO(16)*TWO64
/TWO96
) -
203 TWO(52)*TWO(16)*TWO64
/TWO96
;
204 st
->s
[4] -= st
->s
[5];
207 * restore original FPU control register
209 #if defined(__x86_64__)
210 asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig
));
211 #elif defined(__PPC__)
212 asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig
));
213 #elif defined(__s390x__)
214 asm volatile ("lfpc %0"::"m"(fpc_orig
));
215 #elif defined(__sparc__)
216 asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig
));
217 #elif defined(__mips__)
218 asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig
));
225 void poly1305_blocks(void *ctx
, const unsigned char *inp
, size_t len
,
228 poly1305_internal
*st
= (poly1305_internal
*)ctx
;
229 elem64 in0
, in1
, in2
, in3
;
230 u64 pad
= (u64
)padbit
<<32;
232 double x0
, x1
, x2
, x3
;
233 double h0lo
, h0hi
, h1lo
, h1hi
, h2lo
, h2hi
, h3lo
, h3hi
;
234 double c0lo
, c0hi
, c1lo
, c1hi
, c2lo
, c2hi
, c3lo
, c3hi
;
236 const double r0lo
= st
->r
[0];
237 const double r0hi
= st
->r
[1];
238 const double r1lo
= st
->r
[2];
239 const double r1hi
= st
->r
[3];
240 const double r2lo
= st
->r
[4];
241 const double r2hi
= st
->r
[5];
242 const double r3lo
= st
->r
[6];
243 const double r3hi
= st
->r
[7];
245 const double s1lo
= st
->s
[0];
246 const double s1hi
= st
->s
[1];
247 const double s2lo
= st
->s
[2];
248 const double s2hi
= st
->s
[3];
249 const double s3lo
= st
->s
[4];
250 const double s3hi
= st
->s
[5];
253 * set "truncate" rounding mode
255 #if defined(__x86_64__)
258 asm volatile ("stmxcsr %0":"=m"(mxcsr_orig
));
259 asm volatile ("ldmxcsr %0"::"m"(mxcsr
));
260 #elif defined(__PPC__)
261 double fpscr_orig
, fpscr
= *(double *)&one
;
263 asm volatile ("mffs %0":"=f"(fpscr_orig
));
264 asm volatile ("mtfsf 255,%0"::"f"(fpscr
));
265 #elif defined(__s390x__)
268 asm volatile ("stfpc %0":"=m"(fpc_orig
));
269 asm volatile ("lfpc %0"::"m"(fpc
));
270 #elif defined(__sparc__)
273 asm volatile ("stx %%fsr,%0":"=m"(fsr_orig
));
274 asm volatile ("ldx %0,%%fsr"::"m"(fsr
));
275 #elif defined(__mips__)
278 asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig
));
279 asm volatile ("ctc1 %0,$31"::"r"(fcsr
));
283 * load base 2^32 and de-bias
285 h0lo
= st
->h
[0].d
- TWO(52)*TWO0
;
286 h1lo
= st
->h
[1].d
- TWO(52)*TWO32
;
287 h2lo
= st
->h
[2].d
- TWO(52)*TWO64
;
288 h3lo
= st
->h
[3].d
- TWO(52)*TWO96
;
296 in0
.u
= EXP(52+0) | U8TOU32(&inp
[0]);
297 in1
.u
= EXP(52+32) | U8TOU32(&inp
[4]);
298 in2
.u
= EXP(52+64) | U8TOU32(&inp
[8]);
299 in3
.u
= EXP(52+96) | U8TOU32(&inp
[12]) | pad
;
301 x0
= in0
.d
- TWO(52)*TWO0
;
302 x1
= in1
.d
- TWO(52)*TWO32
;
303 x2
= in2
.d
- TWO(52)*TWO64
;
304 x3
= in3
.d
- TWO(52)*TWO96
;
315 in0
.u
= EXP(52+0) | U8TOU32(&inp
[0]);
316 in1
.u
= EXP(52+32) | U8TOU32(&inp
[4]);
317 in2
.u
= EXP(52+64) | U8TOU32(&inp
[8]);
318 in3
.u
= EXP(52+96) | U8TOU32(&inp
[12]) | pad
;
320 x0
= in0
.d
- TWO(52)*TWO0
;
321 x1
= in1
.d
- TWO(52)*TWO32
;
322 x2
= in2
.d
- TWO(52)*TWO64
;
323 x3
= in3
.d
- TWO(52)*TWO96
;
326 * note that there are multiple ways to accumulate input, e.g.
327 * one can as well accumulate to h0lo-h1lo-h1hi-h2hi...
335 * carries that cross 32n-bit (and 130-bit) boundaries
337 c0lo
= (h0lo
+ TWO(52)*TWO32
) - TWO(52)*TWO32
;
338 c1lo
= (h1lo
+ TWO(52)*TWO64
) - TWO(52)*TWO64
;
339 c2lo
= (h2lo
+ TWO(52)*TWO96
) - TWO(52)*TWO96
;
340 c3lo
= (h3lo
+ TWO(52)*TWO130
) - TWO(52)*TWO130
;
342 c0hi
= (h0hi
+ TWO(52)*TWO32
) - TWO(52)*TWO32
;
343 c1hi
= (h1hi
+ TWO(52)*TWO64
) - TWO(52)*TWO64
;
344 c2hi
= (h2hi
+ TWO(52)*TWO96
) - TWO(52)*TWO96
;
345 c3hi
= (h3hi
+ TWO(52)*TWO130
) - TWO(52)*TWO130
;
348 * base 2^48 -> base 2^32 with last reduction step
350 x1
= (h1lo
- c1lo
) + c0lo
;
351 x2
= (h2lo
- c2lo
) + c1lo
;
352 x3
= (h3lo
- c3lo
) + c2lo
;
353 x0
= (h0lo
- c0lo
) + c3lo
* (5.0/TWO130
);
355 x1
+= (h1hi
- c1hi
) + c0hi
;
356 x2
+= (h2hi
- c2hi
) + c1hi
;
357 x3
+= (h3hi
- c3hi
) + c2hi
;
358 x0
+= (h0hi
- c0hi
) + c3hi
* (5.0/TWO130
);
364 * base 2^32 * base 2^16 = base 2^48
366 h0lo
= s3lo
* x1
+ s2lo
* x2
+ s1lo
* x3
+ r0lo
* x0
;
367 h1lo
= r0lo
* x1
+ s3lo
* x2
+ s2lo
* x3
+ r1lo
* x0
;
368 h2lo
= r1lo
* x1
+ r0lo
* x2
+ s3lo
* x3
+ r2lo
* x0
;
369 h3lo
= r2lo
* x1
+ r1lo
* x2
+ r0lo
* x3
+ r3lo
* x0
;
371 h0hi
= s3hi
* x1
+ s2hi
* x2
+ s1hi
* x3
+ r0hi
* x0
;
372 h1hi
= r0hi
* x1
+ s3hi
* x2
+ s2hi
* x3
+ r1hi
* x0
;
373 h2hi
= r1hi
* x1
+ r0hi
* x2
+ s3hi
* x3
+ r2hi
* x0
;
374 h3hi
= r2hi
* x1
+ r1hi
* x2
+ r0hi
* x3
+ r3hi
* x0
;
382 * carries that cross 32n-bit (and 130-bit) boundaries
384 c0lo
= (h0lo
+ TWO(52)*TWO32
) - TWO(52)*TWO32
;
385 c1lo
= (h1lo
+ TWO(52)*TWO64
) - TWO(52)*TWO64
;
386 c2lo
= (h2lo
+ TWO(52)*TWO96
) - TWO(52)*TWO96
;
387 c3lo
= (h3lo
+ TWO(52)*TWO130
) - TWO(52)*TWO130
;
389 c0hi
= (h0hi
+ TWO(52)*TWO32
) - TWO(52)*TWO32
;
390 c1hi
= (h1hi
+ TWO(52)*TWO64
) - TWO(52)*TWO64
;
391 c2hi
= (h2hi
+ TWO(52)*TWO96
) - TWO(52)*TWO96
;
392 c3hi
= (h3hi
+ TWO(52)*TWO130
) - TWO(52)*TWO130
;
395 * base 2^48 -> base 2^32 with last reduction step
397 x1
= (h1lo
- c1lo
) + c0lo
;
398 x2
= (h2lo
- c2lo
) + c1lo
;
399 x3
= (h3lo
- c3lo
) + c2lo
;
400 x0
= (h0lo
- c0lo
) + c3lo
* (5.0/TWO130
);
402 x1
+= (h1hi
- c1hi
) + c0hi
;
403 x2
+= (h2hi
- c2hi
) + c1hi
;
404 x3
+= (h3hi
- c3hi
) + c2hi
;
405 x0
+= (h0hi
- c0hi
) + c3hi
* (5.0/TWO130
);
408 * store base 2^32, with bias
410 st
->h
[1].d
= x1
+ TWO(52)*TWO32
;
411 st
->h
[2].d
= x2
+ TWO(52)*TWO64
;
412 st
->h
[3].d
= x3
+ TWO(52)*TWO96
;
413 st
->h
[0].d
= x0
+ TWO(52)*TWO0
;
416 * restore original FPU control register
418 #if defined(__x86_64__)
419 asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig
));
420 #elif defined(__PPC__)
421 asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig
));
422 #elif defined(__s390x__)
423 asm volatile ("lfpc %0"::"m"(fpc_orig
));
424 #elif defined(__sparc__)
425 asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig
));
426 #elif defined(__mips__)
427 asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig
));
431 void poly1305_emit(void *ctx
, unsigned char mac
[16], const u32 nonce
[4])
433 poly1305_internal
*st
= (poly1305_internal
*) ctx
;
434 u64 h0
, h1
, h2
, h3
, h4
;
435 u32 g0
, g1
, g2
, g3
, g4
;
440 * thanks to bias masking exponent gives integer result
442 h0
= st
->h
[0].u
& 0x000fffffffffffffULL
;
443 h1
= st
->h
[1].u
& 0x000fffffffffffffULL
;
444 h2
= st
->h
[2].u
& 0x000fffffffffffffULL
;
445 h3
= st
->h
[3].u
& 0x000fffffffffffffULL
;
448 * can be partially reduced, so reduce...
450 h4
= h3
>>32; h3
&= 0xffffffffU
;
456 h1
+= h0
>>32; h0
&= 0xffffffffU
;
457 h2
+= h1
>>32; h1
&= 0xffffffffU
;
458 h3
+= h2
>>32; h2
&= 0xffffffffU
;
461 g0
= (u32
)(t
= h0
+ 5);
462 g1
= (u32
)(t
= h1
+ (t
>> 32));
463 g2
= (u32
)(t
= h2
+ (t
>> 32));
464 g3
= (u32
)(t
= h3
+ (t
>> 32));
465 g4
= h4
+ (u32
)(t
>> 32);
467 /* if there was carry, select g0-g3 */
468 mask
= 0 - (g4
>> 2);
479 /* mac = (h + nonce) % (2^128) */
480 g0
= (u32
)(t
= (u64
)g0
+ nonce
[0]);
481 g1
= (u32
)(t
= (u64
)g1
+ (t
>> 32) + nonce
[1]);
482 g2
= (u32
)(t
= (u64
)g2
+ (t
>> 32) + nonce
[2]);
483 g3
= (u32
)(t
= (u64
)g3
+ (t
>> 32) + nonce
[3]);
488 U32TO8(mac
+ 12, g3
);