/*
 * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

/*
 * This module is meant to be used as a template for non-x87 floating-
 * point assembly modules. The template itself is x86_64-specific
 * though, as it was debugged on x86_64. So the implementor would
 * have to recognize platform-specific parts, UxTOy and inline asm,
 * and act accordingly.
 *
 * Huh? x86_64-specific code as template for non-x87? Note seven, which
 * is not a typo, but a reference to 80-bit precision. This module on the
 * other hand relies on 64-bit precision operations, which are default
 * for x86_64 code. And since we are at it, just for sense of it,
 * large-block performance in cycles per processed byte for *this* code
 * is:
 *
 *                      gcc-4.8         icc-15.0        clang-3.4(*)
 *
 * Westmere             4.96            5.09            4.37
 * Sandy Bridge         4.95            4.90            4.17
 * Haswell              4.92            4.87            3.78
 * Bulldozer            4.67            4.49            4.68
 * VIA Nano             7.07            7.05            5.98
 * Silvermont           10.6            9.61            12.6
 *
 * (*)  clang managed to discover parallelism and deployed SIMD;
 *
 * And for range of other platforms with unspecified gcc versions:
 */

#if !(defined(__GNUC__) && __GNUC__>=2)
# error "this is gcc-specific template"
#endif

#include <stdlib.h>

typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef union { double d; u64 u; } elem64;

#define TWO(p)          ((double)(1ULL<<(p)))
#define TWO0            TWO(0)
#define TWO32           TWO(32)
#define TWO64           (TWO32*TWO(32))
#define TWO96           (TWO64*TWO(32))
#define TWO130          (TWO96*TWO(34))

#define EXP(p)          ((1023ULL+(p))<<52)
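
/*
 * The entire module rests on one encoding trick: or-ing EXP(52+n) over
 * the bit pattern of a 32-bit integer plants that integer in the
 * mantissa of a binary64 value equal to 2^52*2^n plus the integer times
 * 2^n, so byte strings convert to floating point with no arithmetic.
 * A minimal sketch of the idiom, assuming IEEE 754 binary64 and that
 * u64 and double share byte order (which the elem64 union above already
 * presumes):
 *
 *      elem64 x;
 *      x.u = EXP(52+0) | 123;          (x.d is exactly 2^52 + 123)
 *      double v = x.d - TWO(52)*TWO0;  (de-biased: v == 123.0)
 *
 * With n = 32, 64, 96 the same pattern yields the limb pre-scaled by
 * 2^32, 2^64, 2^96 respectively, which is how all limbs travel below.
 */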

#if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__))
# define U8TOU32(p)     (*(const u32 *)(p))
# define U32TO8(p,v)    (*(u32 *)(p) = (v))
#elif defined(__PPC__)
# define U8TOU32(p)     ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; })
# define U32TO8(p,v)    asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory")
#elif defined(__s390x__)
# define U8TOU32(p)     ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; })
# define U32TO8(p,v)    asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v))
#endif

#ifndef U8TOU32
# define U8TOU32(p)     ((u32)(p)[0]     | (u32)(p)[1]<<8 | \
                         (u32)(p)[2]<<16 | (u32)(p)[3]<<24  )
#endif
#ifndef U32TO8
# define U32TO8(p,v)    ((p)[0] = (u8)(v),       (p)[1] = (u8)((v)>>8), \
                         (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) )
#endif

typedef struct {
    elem64 h[4];
    double r[8];
    double s[6];
} poly1305_internal;
95 /* "round toward zero (truncate), mask all exceptions" */
96 #if defined(__x86_64__)
97 static const u32 mxcsr
= 0x7f80;
98 #elif defined(__PPC__)
99 static const u64 one
= 1;
100 #elif defined(__s390x__)
101 static const u32 fpc
= 1;
102 #elif defined(__sparc__)
103 static const u64 fsr
= 1ULL<<30;
104 #elif defined(__mips__)
105 static const u32 fcsr
= 1;
107 #error "unrecognized platform"
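
/*
 * For the record, the x86_64 constant decodes as: MXCSR bits 13-14 set
 * to 11 select round-toward-zero, and bits 7-12 set mask all SIMD
 * floating-point exceptions, hence 0x7f80. The remaining constants are
 * presumed to encode the equivalent "truncate, mask everything" mode
 * for their respective FPUs. Truncation is not a detail: the
 * add-then-subtract carry extraction below relies on low-order bits
 * being chopped rather than rounded to nearest.
 */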

int poly1305_init(void *ctx, const unsigned char key[16])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    elem64 r0, r1, r2, r3;

    /* h = 0, biased */
#if 0
    st->h[0].d = TWO(52)*TWO0;
    st->h[1].d = TWO(52)*TWO32;
    st->h[2].d = TWO(52)*TWO64;
    st->h[3].d = TWO(52)*TWO96;
#else
    st->h[0].u = EXP(52+0);
    st->h[1].u = EXP(52+32);
    st->h[2].u = EXP(52+64);
    st->h[3].u = EXP(52+96);
#endif
130 * set "truncate" rounding mode
132 #if defined(__x86_64__)
135 asm volatile ("stmxcsr %0":"=m"(mxcsr_orig
));
136 asm volatile ("ldmxcsr %0"::"m"(mxcsr
));
137 #elif defined(__PPC__)
138 double fpscr_orig
, fpscr
= *(double *)&one
;
140 asm volatile ("mffs %0":"=f"(fpscr_orig
));
141 asm volatile ("mtfsf 255,%0"::"f"(fpscr
));
142 #elif defined(__s390x__)
145 asm volatile ("stfpc %0":"=m"(fpc_orig
));
146 asm volatile ("lfpc %0"::"m"(fpc
));
147 #elif defined(__sparc__)
150 asm volatile ("stx %%fsr,%0":"=m"(fsr_orig
));
151 asm volatile ("ldx %0,%%fsr"::"m"(fsr
));
152 #elif defined(__mips__)
155 asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig
));
156 asm volatile ("ctc1 %0,$31"::"r"(fcsr
));

        /* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
        r0.u = EXP(52+0)  | (U8TOU32(&key[0])  & 0x0fffffff);
        r1.u = EXP(52+32) | (U8TOU32(&key[4])  & 0x0ffffffc);
        r2.u = EXP(52+64) | (U8TOU32(&key[8])  & 0x0ffffffc);
        r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc);

        st->r[0] = r0.d - TWO(52)*TWO0;
        st->r[2] = r1.d - TWO(52)*TWO32;
        st->r[4] = r2.d - TWO(52)*TWO64;
        st->r[6] = r3.d - TWO(52)*TWO96;

        st->s[0] = st->r[2] * (5.0/TWO130);
        st->s[2] = st->r[4] * (5.0/TWO130);
        st->s[4] = st->r[6] * (5.0/TWO130);
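
        /*
         * The 5.0/TWO130 factor encodes the congruence 2^130 = 5 mod
         * p = 2^130-5: any product term that lands on scale 2^130 or
         * above may be folded to the bottom by multiplying with
         * 5/2^130. Pre-scaling r1, r2, r3 turns that fold into ordinary
         * multiplications in poly1305_blocks. For instance the x3*r1
         * term there: x3 carries scale 2^96 and r1 scale 2^32, so the
         * product sits at 2^128 = 2^130/4, and x3*s1 = x3*r1*5/2^130
         * is still an exact integer because the clamp above keeps
         * r1, r2, r3 divisible by 4.
         */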

        /*
         * base 2^32 -> base 2^16
         */
        st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) -
                    TWO(52)*TWO(16)*TWO0;
        st->r[0] -= st->r[1];

        st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) -
                    TWO(52)*TWO(16)*TWO32;
        st->r[2] -= st->r[3];

        st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) -
                    TWO(52)*TWO(16)*TWO64;
        st->r[4] -= st->r[5];

        st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) -
                    TWO(52)*TWO(16)*TWO96;
        st->r[6] -= st->r[7];

        st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) -
                    TWO(52)*TWO(16)*TWO0/TWO96;
        st->s[0] -= st->s[1];

        st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) -
                    TWO(52)*TWO(16)*TWO32/TWO96;
        st->s[2] -= st->s[3];

        st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) -
                    TWO(52)*TWO(16)*TWO64/TWO96;
        st->s[4] -= st->s[5];
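
        /*
         * One worked instance of the split, with truncate rounding in
         * effect: take st->r[0] = 0x12345678. Adding 2^52*2^16 pushes
         * everything below 2^16 past the 53-bit mantissa, so the sum
         * truncates to 2^52*2^16 + 0x12340000; subtracting the constant
         * leaves st->r[1] = 0x12340000 (the "hi" half, used as r0hi)
         * and the final subtraction leaves st->r[0] = 0x5678 (r0lo).
         */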

        /*
         * restore original FPU control register
         */
#if defined(__x86_64__)
        asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
        asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
        asm volatile ("lfpc %0"::"m"(fpc_orig));
#elif defined(__sparc__)
        asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
        asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig));
#endif
    }

    return 0;
}

void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
                     int padbit)
{
    poly1305_internal *st = (poly1305_internal *)ctx;
    elem64 in0, in1, in2, in3;
    u64 pad = (u64)padbit<<32;

    double x0, x1, x2, x3;
    double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi;
    double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi;

    const double r0lo = st->r[0];
    const double r0hi = st->r[1];
    const double r1lo = st->r[2];
    const double r1hi = st->r[3];
    const double r2lo = st->r[4];
    const double r2hi = st->r[5];
    const double r3lo = st->r[6];
    const double r3hi = st->r[7];

    const double s1lo = st->s[0];
    const double s1hi = st->s[1];
    const double s2lo = st->s[2];
    const double s2hi = st->s[3];
    const double s3lo = st->s[4];
    const double s3hi = st->s[5];
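
    /*
     * The point of the 16-bit halves: a 32x32-bit product needs up to
     * 64 bits and would not survive a 53-bit mantissa, but 32x16 needs
     * only 48, and even a sum of four such terms stays exact at roughly
     * 50 bits. Hence every 32-bit limb of r lives here as a lo/hi pair
     * of doubles, each half still carrying its own power-of-two scale
     * from the split performed in poly1305_init.
     */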

    /*
     * set "truncate" rounding mode
     */
#if defined(__x86_64__)
    u32 mxcsr_orig;

    asm volatile ("stmxcsr %0":"=m"(mxcsr_orig));
    asm volatile ("ldmxcsr %0"::"m"(mxcsr));
#elif defined(__PPC__)
    double fpscr_orig, fpscr = *(double *)&one;

    asm volatile ("mffs %0":"=f"(fpscr_orig));
    asm volatile ("mtfsf 255,%0"::"f"(fpscr));
#elif defined(__s390x__)
    u32 fpc_orig;

    asm volatile ("stfpc %0":"=m"(fpc_orig));
    asm volatile ("lfpc %0"::"m"(fpc));
#elif defined(__sparc__)
    u64 fsr_orig;

    asm volatile ("stx %%fsr,%0":"=m"(fsr_orig));
    asm volatile ("ldx %0,%%fsr"::"m"(fsr));
#elif defined(__mips__)
    u32 fcsr_orig;

    asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig));
    asm volatile ("ctc1 %0,$31"::"r"(fcsr));
#endif

    /*
     * load base 2^32 and de-bias
     */
    h0lo = st->h[0].d - TWO(52)*TWO0;
    h1lo = st->h[1].d - TWO(52)*TWO32;
    h2lo = st->h[2].d - TWO(52)*TWO64;
    h3lo = st->h[3].d - TWO(52)*TWO96;

#ifdef __clang__
    h0hi = 0;
    h1hi = 0;
    h2hi = 0;
    h3hi = 0;
#else
    in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
    in1.u = EXP(52+32) | U8TOU32(&inp[4]);
    in2.u = EXP(52+64) | U8TOU32(&inp[8]);
    in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;

    x0 = in0.d - TWO(52)*TWO0;
    x1 = in1.d - TWO(52)*TWO32;
    x2 = in2.d - TWO(52)*TWO64;
    x3 = in3.d - TWO(52)*TWO96;

    x0 += h0lo;
    x1 += h1lo;
    x2 += h2lo;
    x3 += h3lo;

    goto fast_entry;
#endif

    do {
        in0.u = EXP(52+0)  | U8TOU32(&inp[0]);
        in1.u = EXP(52+32) | U8TOU32(&inp[4]);
        in2.u = EXP(52+64) | U8TOU32(&inp[8]);
        in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;

        x0 = in0.d - TWO(52)*TWO0;
        x1 = in1.d - TWO(52)*TWO32;
        x2 = in2.d - TWO(52)*TWO64;
        x3 = in3.d - TWO(52)*TWO96;

        /*
         * note that there are multiple ways to accumulate input, e.g.
         * one can as well accumulate to h0lo-h1lo-h1hi-h2hi...
         */
        h0lo += x0;
        h1lo += x1;
        h2lo += x2;
        h3lo += x3;

        /*
         * carries that cross 32n-bit (and 130-bit) boundaries
         */
        c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32;
        c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64;
        c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96;
        c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;

        c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32;
        c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64;
        c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96;
        c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;
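
        /*
         * Same truncate-and-subtract idiom as in the key setup; one
         * worked case: suppose h0lo = 0x123456789, i.e. the limb has
         * outgrown 32 bits. Then (h0lo + 2^52*2^32) - 2^52*2^32
         * truncates everything below 2^32 away, so c0lo = 0x100000000;
         * the reduction step below subtracts it from this limb and
         * adds it into the next one.
         */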

        /*
         * base 2^48 -> base 2^32 with last reduction step
         */
        x1 = (h1lo - c1lo) + c0lo;
        x2 = (h2lo - c2lo) + c1lo;
        x3 = (h3lo - c3lo) + c2lo;
        x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130);

        x1 += (h1hi - c1hi) + c0hi;
        x2 += (h2hi - c2hi) + c1hi;
        x3 += (h3hi - c3hi) + c2hi;
        x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);

#ifndef __clang__
    fast_entry:
#endif
        /*
         * base 2^32 * base 2^16 = base 2^48
         */
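
        /*
         * The column pattern is schoolbook multiplication of
         * (x0 + x1 + x2 + x3) by (r0 + r1 + r2 + r3), except that any
         * term whose combined scale reaches 2^128 wraps modulo
         * p = 2^130-5 into the bucket four positions down, which is
         * what the pre-scaled sN stand in for: e.g. in h0lo the x1
         * column (scale 2^32) meets r3 (scale 2^96), hence s3lo * x1.
         */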
        h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0;
        h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0;
        h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0;
        h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0;

        h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0;
        h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0;
        h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0;
        h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0;

        inp += 16;
        len -= 16;

    } while (len >= 16);

    /*
     * carries that cross 32n-bit (and 130-bit) boundaries
     */
    c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32;
    c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64;
    c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96;
    c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;

    c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32;
    c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64;
    c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96;
    c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;

    /*
     * base 2^48 -> base 2^32 with last reduction step
     */
    x1 = (h1lo - c1lo) + c0lo;
    x2 = (h2lo - c2lo) + c1lo;
    x3 = (h3lo - c3lo) + c2lo;
    x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130);

    x1 += (h1hi - c1hi) + c0hi;
    x2 += (h2hi - c2hi) + c1hi;
    x3 += (h3hi - c3hi) + c2hi;
    x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);

    /*
     * store base 2^32, with bias
     */
    st->h[1].d = x1 + TWO(52)*TWO32;
    st->h[2].d = x2 + TWO(52)*TWO64;
    st->h[3].d = x3 + TWO(52)*TWO96;
    st->h[0].d = x0 + TWO(52)*TWO0;

    /*
     * restore original FPU control register
     */
#if defined(__x86_64__)
    asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
    asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
    asm volatile ("lfpc %0"::"m"(fpc_orig));
#elif defined(__sparc__)
    asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
    asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig));
#endif
}

void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    u64 h0, h1, h2, h3, h4;
    u32 g0, g1, g2, g3, g4;
    u32 mask;
    u64 t;

    /*
     * thanks to bias masking exponent gives integer result
     */
    h0 = st->h[0].u & 0x000fffffffffffffULL;
    h1 = st->h[1].u & 0x000fffffffffffffULL;
    h2 = st->h[2].u & 0x000fffffffffffffULL;
    h3 = st->h[3].u & 0x000fffffffffffffULL;
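
    /*
     * Spelled out: each h[n].d still holds 2^52*2^32n plus the limb,
     * and that bias is exactly the exponent field of the binary64
     * pattern. Masking the top 12 bits off therefore leaves the 52
     * mantissa bits, which are the limb as a plain integer, with no
     * float-to-int conversion involved. E.g. h[0].d == 2^52 + 123 has
     * bit pattern EXP(52) | 123, and the mask recovers 123.
     */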

    /*
     * can be partially reduced, so reduce...
     */
    h4 = h3>>32; h3 &= 0xffffffffU;
    g4 = h4&-4;
    h4 &= 3;
    g4 += g4>>2;

    h0 += g4;
    h1 += h0>>32; h0 &= 0xffffffffU;
    h2 += h1>>32; h1 &= 0xffffffffU;
    h3 += h2>>32; h2 &= 0xffffffffU;
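
    /*
     * This mirrors the floating-point fold: h4 holds the bits at scale
     * 2^128, of which everything at 2^130 and above, i.e. h4 with its
     * low two bits dropped, is congruent to 5 times itself shifted
     * down. g4 = 4*(h4>>2) plus g4>>2 = (h4>>2) makes exactly
     * 5*(h4>>2), added back at the bottom, while h4 &= 3 keeps the
     * residue below 2^130.
     */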

    /* compute h + -p */
    g0 = (u32)(t = h0 + 5);
    g1 = (u32)(t = h1 + (t >> 32));
    g2 = (u32)(t = h2 + (t >> 32));
    g3 = (u32)(t = h3 + (t >> 32));
    g4 = h4 + (u32)(t >> 32);
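
    /*
     * Classic final-subtraction trick: rather than comparing h with
     * p = 2^130-5, add 5 and let the carry chain run; h >= p exactly
     * when h+5 reaches 2^130, i.e. when bit 2 of g4 ends up set. E.g.
     * h = p gives h+5 = 2^130, so g0-g3 = 0 and g4 = 4: the reduced
     * candidate g is selected, correctly giving h mod p = 0, while
     * h = p-1 leaves the bit clear and h itself is kept.
     */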

    /* if there was carry, select g0-g3 */
    mask = 0 - (g4 >> 2);
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    mask = ~mask;
    g0 |= (h0 & mask);
    g1 |= (h1 & mask);
    g2 |= (h2 & mask);
    g3 |= (h3 & mask);

    /* mac = (h + nonce) % (2^128) */
    g0 = (u32)(t = (u64)g0 + nonce[0]);
    g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]);
    g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]);
    g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]);

    U32TO8(mac + 0, g0);
    U32TO8(mac + 4, g1);
    U32TO8(mac + 8, g2);
    U32TO8(mac + 12, g3);
}