]>
Commit | Line | Data |
---|---|---|
aa6bb135 | 1 | /* |
6b6981ef | 2 | * Copyright 2016-20018 The OpenSSL Project Authors. All Rights Reserved. |
c8d1c9b0 | 3 | * |
aa6bb135 RS |
4 | * Licensed under the OpenSSL license (the "License"). You may not use |
5 | * this file except in compliance with the License. You can obtain a copy | |
6 | * in the file LICENSE in the source distribution or at | |
7 | * https://www.openssl.org/source/license.html | |
c8d1c9b0 AP |
8 | */ |
9 | ||
10 | /* | |
11 | * This module is meant to be used as template for non-x87 floating- | |
12 | * point assembly modules. The template itself is x86_64-specific | |
13 | * though, as it was debugged on x86_64. So that implementor would | |
14 | * have to recognize platform-specific parts, UxTOy and inline asm, | |
15 | * and act accordingly. | |
16 | * | |
17 | * Huh? x86_64-specific code as template for non-x87? Note seven, which | |
18 | * is not a typo, but reference to 80-bit precision. This module on the | |
19 | * other hand relies on 64-bit precision operations, which are default | |
20 | * for x86_64 code. And since we are at it, just for sense of it, | |
21 | * large-block performance in cycles per processed byte for *this* code | |
22 | * is: | |
32c69853 | 23 | * gcc-4.8 icc-15.0 clang-3.4(*) |
c8d1c9b0 | 24 | * |
32c69853 DMSP |
25 | * Westmere 4.96 5.09 4.37 |
26 | * Sandy Bridge 4.95 4.90 4.17 | |
27 | * Haswell 4.92 4.87 3.78 | |
28 | * Bulldozer 4.67 4.49 4.68 | |
29 | * VIA Nano 7.07 7.05 5.98 | |
30 | * Silvermont 10.6 9.61 12.6 | |
c8d1c9b0 | 31 | * |
32c69853 | 32 | * (*) clang managed to discover parallelism and deployed SIMD; |
c8d1c9b0 AP |
33 | * |
34 | * And for range of other platforms with unspecified gcc versions: | |
35 | * | |
32c69853 DMSP |
36 | * Freescale e300 12.5 |
37 | * PPC74x0 10.8 | |
38 | * POWER6 4.92 | |
39 | * POWER7 4.50 | |
40 | * POWER8 4.10 | |
13f6857d | 41 | * POWER9 3.14 |
c8d1c9b0 | 42 | * |
32c69853 DMSP |
43 | * z10 11.2 |
44 | * z196+ 7.30 | |
c8d1c9b0 | 45 | * |
32c69853 DMSP |
46 | * UltraSPARC III 16.0 |
47 | * SPARC T4 16.1 | |
c8d1c9b0 AP |
48 | */ |
49 | ||
50 | #if !(defined(__GNUC__) && __GNUC__>=2) | |
51 | # error "this is gcc-specific template" | |
52 | #endif | |
53 | ||
54 | #include <stdlib.h> | |
55 | ||
56 | typedef unsigned char u8; | |
57 | typedef unsigned int u32; | |
58 | typedef unsigned long long u64; | |
59 | typedef union { double d; u64 u; } elem64; | |
60 | ||
32c69853 DMSP |
61 | #define TWO(p) ((double)(1ULL<<(p))) |
62 | #define TWO0 TWO(0) | |
63 | #define TWO32 TWO(32) | |
64 | #define TWO64 (TWO32*TWO(32)) | |
65 | #define TWO96 (TWO64*TWO(32)) | |
66 | #define TWO130 (TWO96*TWO(34)) | |
c8d1c9b0 | 67 | |
32c69853 | 68 | #define EXP(p) ((1023ULL+(p))<<52) |
c8d1c9b0 AP |
69 | |
70 | #if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__)) | |
32c69853 DMSP |
71 | # define U8TOU32(p) (*(const u32 *)(p)) |
72 | # define U32TO8(p,v) (*(u32 *)(p) = (v)) | |
c8d1c9b0 | 73 | #elif defined(__PPC__) |
32c69853 DMSP |
74 | # define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; }) |
75 | # define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory") | |
c8d1c9b0 | 76 | #elif defined(__s390x__) |
32c69853 DMSP |
77 | # define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; }) |
78 | # define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v)) | |
c8d1c9b0 AP |
79 | #endif |
80 | ||
81 | #ifndef U8TOU32 | |
32c69853 DMSP |
82 | # define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \ |
83 | (u32)(p)[2]<<16 | (u32)(p)[3]<<24 ) | |
c8d1c9b0 AP |
84 | #endif |
85 | #ifndef U32TO8 | |
32c69853 DMSP |
86 | # define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \ |
87 | (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) ) | |
c8d1c9b0 AP |
88 | #endif |
89 | ||
90 | typedef struct { | |
91 | elem64 h[4]; | |
92 | double r[8]; | |
93 | double s[6]; | |
94 | } poly1305_internal; | |
95 | ||
96 | /* "round toward zero (truncate), mask all exceptions" */ | |
97 | #if defined(__x86_64__) | |
98 | static const u32 mxcsr = 0x7f80; | |
99 | #elif defined(__PPC__) | |
100 | static const u64 one = 1; | |
101 | #elif defined(__s390x__) | |
102 | static const u32 fpc = 1; | |
103 | #elif defined(__sparc__) | |
104 | static const u64 fsr = 1ULL<<30; | |
6b6981ef AP |
105 | #elif defined(__mips__) |
106 | static const u32 fcsr = 1; | |
c8d1c9b0 AP |
107 | #else |
108 | #error "unrecognized platform" | |
109 | #endif | |
110 | ||
111 | int poly1305_init(void *ctx, const unsigned char key[16]) | |
112 | { | |
113 | poly1305_internal *st = (poly1305_internal *) ctx; | |
114 | elem64 r0, r1, r2, r3; | |
115 | ||
116 | /* h = 0, biased */ | |
117 | #if 0 | |
118 | st->h[0].d = TWO(52)*TWO0; | |
119 | st->h[1].d = TWO(52)*TWO32; | |
120 | st->h[2].d = TWO(52)*TWO64; | |
121 | st->h[3].d = TWO(52)*TWO96; | |
122 | #else | |
123 | st->h[0].u = EXP(52+0); | |
124 | st->h[1].u = EXP(52+32); | |
125 | st->h[2].u = EXP(52+64); | |
126 | st->h[3].u = EXP(52+96); | |
127 | #endif | |
128 | ||
129 | if (key) { | |
130 | /* | |
131 | * set "truncate" rounding mode | |
132 | */ | |
133 | #if defined(__x86_64__) | |
134 | u32 mxcsr_orig; | |
135 | ||
136 | asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); | |
137 | asm volatile ("ldmxcsr %0"::"m"(mxcsr)); | |
138 | #elif defined(__PPC__) | |
139 | double fpscr_orig, fpscr = *(double *)&one; | |
140 | ||
141 | asm volatile ("mffs %0":"=f"(fpscr_orig)); | |
142 | asm volatile ("mtfsf 255,%0"::"f"(fpscr)); | |
143 | #elif defined(__s390x__) | |
144 | u32 fpc_orig; | |
145 | ||
146 | asm volatile ("stfpc %0":"=m"(fpc_orig)); | |
147 | asm volatile ("lfpc %0"::"m"(fpc)); | |
148 | #elif defined(__sparc__) | |
149 | u64 fsr_orig; | |
150 | ||
151 | asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); | |
152 | asm volatile ("ldx %0,%%fsr"::"m"(fsr)); | |
6b6981ef AP |
153 | #elif defined(__mips__) |
154 | u32 fcsr_orig; | |
155 | ||
156 | asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); | |
157 | asm volatile ("ctc1 %0,$31"::"r"(fcsr)); | |
c8d1c9b0 AP |
158 | #endif |
159 | ||
160 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |
161 | r0.u = EXP(52+0) | (U8TOU32(&key[0]) & 0x0fffffff); | |
162 | r1.u = EXP(52+32) | (U8TOU32(&key[4]) & 0x0ffffffc); | |
163 | r2.u = EXP(52+64) | (U8TOU32(&key[8]) & 0x0ffffffc); | |
164 | r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc); | |
165 | ||
166 | st->r[0] = r0.d - TWO(52)*TWO0; | |
167 | st->r[2] = r1.d - TWO(52)*TWO32; | |
168 | st->r[4] = r2.d - TWO(52)*TWO64; | |
169 | st->r[6] = r3.d - TWO(52)*TWO96; | |
170 | ||
171 | st->s[0] = st->r[2] * (5.0/TWO130); | |
172 | st->s[2] = st->r[4] * (5.0/TWO130); | |
173 | st->s[4] = st->r[6] * (5.0/TWO130); | |
174 | ||
175 | /* | |
176 | * base 2^32 -> base 2^16 | |
177 | */ | |
178 | st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) - | |
179 | TWO(52)*TWO(16)*TWO0; | |
180 | st->r[0] -= st->r[1]; | |
181 | ||
182 | st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) - | |
183 | TWO(52)*TWO(16)*TWO32; | |
184 | st->r[2] -= st->r[3]; | |
185 | ||
186 | st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) - | |
187 | TWO(52)*TWO(16)*TWO64; | |
188 | st->r[4] -= st->r[5]; | |
189 | ||
190 | st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) - | |
191 | TWO(52)*TWO(16)*TWO96; | |
192 | st->r[6] -= st->r[7]; | |
193 | ||
194 | st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) - | |
195 | TWO(52)*TWO(16)*TWO0/TWO96; | |
196 | st->s[0] -= st->s[1]; | |
197 | ||
198 | st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) - | |
199 | TWO(52)*TWO(16)*TWO32/TWO96; | |
200 | st->s[2] -= st->s[3]; | |
201 | ||
202 | st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) - | |
203 | TWO(52)*TWO(16)*TWO64/TWO96; | |
204 | st->s[4] -= st->s[5]; | |
205 | ||
206 | /* | |
207 | * restore original FPU control register | |
208 | */ | |
209 | #if defined(__x86_64__) | |
210 | asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); | |
211 | #elif defined(__PPC__) | |
212 | asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); | |
213 | #elif defined(__s390x__) | |
214 | asm volatile ("lfpc %0"::"m"(fpc_orig)); | |
215 | #elif defined(__sparc__) | |
216 | asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); | |
6b6981ef AP |
217 | #elif defined(__mips__) |
218 | asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); | |
c8d1c9b0 AP |
219 | #endif |
220 | } | |
221 | ||
222 | return 0; | |
223 | } | |
224 | ||
225 | void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, | |
226 | int padbit) | |
227 | { | |
228 | poly1305_internal *st = (poly1305_internal *)ctx; | |
229 | elem64 in0, in1, in2, in3; | |
230 | u64 pad = (u64)padbit<<32; | |
231 | ||
232 | double x0, x1, x2, x3; | |
233 | double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi; | |
234 | double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi; | |
235 | ||
236 | const double r0lo = st->r[0]; | |
237 | const double r0hi = st->r[1]; | |
238 | const double r1lo = st->r[2]; | |
239 | const double r1hi = st->r[3]; | |
240 | const double r2lo = st->r[4]; | |
241 | const double r2hi = st->r[5]; | |
242 | const double r3lo = st->r[6]; | |
243 | const double r3hi = st->r[7]; | |
244 | ||
245 | const double s1lo = st->s[0]; | |
246 | const double s1hi = st->s[1]; | |
247 | const double s2lo = st->s[2]; | |
248 | const double s2hi = st->s[3]; | |
249 | const double s3lo = st->s[4]; | |
250 | const double s3hi = st->s[5]; | |
251 | ||
252 | /* | |
253 | * set "truncate" rounding mode | |
254 | */ | |
255 | #if defined(__x86_64__) | |
256 | u32 mxcsr_orig; | |
257 | ||
258 | asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); | |
259 | asm volatile ("ldmxcsr %0"::"m"(mxcsr)); | |
260 | #elif defined(__PPC__) | |
261 | double fpscr_orig, fpscr = *(double *)&one; | |
262 | ||
263 | asm volatile ("mffs %0":"=f"(fpscr_orig)); | |
264 | asm volatile ("mtfsf 255,%0"::"f"(fpscr)); | |
265 | #elif defined(__s390x__) | |
266 | u32 fpc_orig; | |
267 | ||
268 | asm volatile ("stfpc %0":"=m"(fpc_orig)); | |
269 | asm volatile ("lfpc %0"::"m"(fpc)); | |
270 | #elif defined(__sparc__) | |
271 | u64 fsr_orig; | |
272 | ||
273 | asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); | |
274 | asm volatile ("ldx %0,%%fsr"::"m"(fsr)); | |
6b6981ef AP |
275 | #elif defined(__mips__) |
276 | u32 fcsr_orig; | |
277 | ||
278 | asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); | |
279 | asm volatile ("ctc1 %0,$31"::"r"(fcsr)); | |
c8d1c9b0 AP |
280 | #endif |
281 | ||
282 | /* | |
283 | * load base 2^32 and de-bias | |
284 | */ | |
285 | h0lo = st->h[0].d - TWO(52)*TWO0; | |
286 | h1lo = st->h[1].d - TWO(52)*TWO32; | |
287 | h2lo = st->h[2].d - TWO(52)*TWO64; | |
288 | h3lo = st->h[3].d - TWO(52)*TWO96; | |
289 | ||
290 | #ifdef __clang__ | |
291 | h0hi = 0; | |
292 | h1hi = 0; | |
293 | h2hi = 0; | |
294 | h3hi = 0; | |
295 | #else | |
296 | in0.u = EXP(52+0) | U8TOU32(&inp[0]); | |
297 | in1.u = EXP(52+32) | U8TOU32(&inp[4]); | |
298 | in2.u = EXP(52+64) | U8TOU32(&inp[8]); | |
299 | in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; | |
300 | ||
301 | x0 = in0.d - TWO(52)*TWO0; | |
302 | x1 = in1.d - TWO(52)*TWO32; | |
303 | x2 = in2.d - TWO(52)*TWO64; | |
304 | x3 = in3.d - TWO(52)*TWO96; | |
305 | ||
306 | x0 += h0lo; | |
307 | x1 += h1lo; | |
308 | x2 += h2lo; | |
309 | x3 += h3lo; | |
310 | ||
311 | goto fast_entry; | |
312 | #endif | |
313 | ||
314 | do { | |
315 | in0.u = EXP(52+0) | U8TOU32(&inp[0]); | |
316 | in1.u = EXP(52+32) | U8TOU32(&inp[4]); | |
317 | in2.u = EXP(52+64) | U8TOU32(&inp[8]); | |
318 | in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; | |
319 | ||
320 | x0 = in0.d - TWO(52)*TWO0; | |
321 | x1 = in1.d - TWO(52)*TWO32; | |
322 | x2 = in2.d - TWO(52)*TWO64; | |
323 | x3 = in3.d - TWO(52)*TWO96; | |
324 | ||
325 | /* | |
326 | * note that there are multiple ways to accumulate input, e.g. | |
327 | * one can as well accumulate to h0lo-h1lo-h1hi-h2hi... | |
328 | */ | |
329 | h0lo += x0; | |
330 | h0hi += x1; | |
331 | h2lo += x2; | |
332 | h2hi += x3; | |
333 | ||
334 | /* | |
335 | * carries that cross 32n-bit (and 130-bit) boundaries | |
336 | */ | |
337 | c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; | |
338 | c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; | |
339 | c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; | |
340 | c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; | |
341 | ||
342 | c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; | |
343 | c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; | |
344 | c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; | |
345 | c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; | |
346 | ||
347 | /* | |
348 | * base 2^48 -> base 2^32 with last reduction step | |
349 | */ | |
350 | x1 = (h1lo - c1lo) + c0lo; | |
351 | x2 = (h2lo - c2lo) + c1lo; | |
352 | x3 = (h3lo - c3lo) + c2lo; | |
353 | x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); | |
354 | ||
355 | x1 += (h1hi - c1hi) + c0hi; | |
356 | x2 += (h2hi - c2hi) + c1hi; | |
357 | x3 += (h3hi - c3hi) + c2hi; | |
358 | x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); | |
359 | ||
360 | #ifndef __clang__ | |
361 | fast_entry: | |
362 | #endif | |
32c69853 DMSP |
363 | /* |
364 | * base 2^32 * base 2^16 = base 2^48 | |
365 | */ | |
c8d1c9b0 AP |
366 | h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0; |
367 | h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0; | |
368 | h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0; | |
369 | h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0; | |
370 | ||
371 | h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0; | |
372 | h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0; | |
373 | h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0; | |
374 | h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0; | |
375 | ||
376 | inp += 16; | |
377 | len -= 16; | |
378 | ||
379 | } while (len >= 16); | |
380 | ||
381 | /* | |
382 | * carries that cross 32n-bit (and 130-bit) boundaries | |
383 | */ | |
384 | c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; | |
385 | c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; | |
386 | c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; | |
387 | c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; | |
388 | ||
389 | c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; | |
390 | c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; | |
391 | c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; | |
392 | c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; | |
393 | ||
394 | /* | |
395 | * base 2^48 -> base 2^32 with last reduction step | |
396 | */ | |
397 | x1 = (h1lo - c1lo) + c0lo; | |
398 | x2 = (h2lo - c2lo) + c1lo; | |
399 | x3 = (h3lo - c3lo) + c2lo; | |
400 | x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); | |
401 | ||
402 | x1 += (h1hi - c1hi) + c0hi; | |
403 | x2 += (h2hi - c2hi) + c1hi; | |
404 | x3 += (h3hi - c3hi) + c2hi; | |
405 | x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); | |
406 | ||
407 | /* | |
408 | * store base 2^32, with bias | |
409 | */ | |
410 | st->h[1].d = x1 + TWO(52)*TWO32; | |
411 | st->h[2].d = x2 + TWO(52)*TWO64; | |
412 | st->h[3].d = x3 + TWO(52)*TWO96; | |
413 | st->h[0].d = x0 + TWO(52)*TWO0; | |
414 | ||
415 | /* | |
416 | * restore original FPU control register | |
417 | */ | |
418 | #if defined(__x86_64__) | |
419 | asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); | |
420 | #elif defined(__PPC__) | |
421 | asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); | |
422 | #elif defined(__s390x__) | |
423 | asm volatile ("lfpc %0"::"m"(fpc_orig)); | |
424 | #elif defined(__sparc__) | |
425 | asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); | |
6b6981ef AP |
426 | #elif defined(__mips__) |
427 | asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); | |
c8d1c9b0 AP |
428 | #endif |
429 | } | |
430 | ||
431 | void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) | |
432 | { | |
433 | poly1305_internal *st = (poly1305_internal *) ctx; | |
434 | u64 h0, h1, h2, h3, h4; | |
435 | u32 g0, g1, g2, g3, g4; | |
436 | u64 t; | |
437 | u32 mask; | |
438 | ||
439 | /* | |
440 | * thanks to bias masking exponent gives integer result | |
441 | */ | |
442 | h0 = st->h[0].u & 0x000fffffffffffffULL; | |
443 | h1 = st->h[1].u & 0x000fffffffffffffULL; | |
444 | h2 = st->h[2].u & 0x000fffffffffffffULL; | |
445 | h3 = st->h[3].u & 0x000fffffffffffffULL; | |
446 | ||
447 | /* | |
448 | * can be partially reduced, so reduce... | |
449 | */ | |
450 | h4 = h3>>32; h3 &= 0xffffffffU; | |
451 | g4 = h4&-4; | |
452 | h4 &= 3; | |
453 | g4 += g4>>2; | |
454 | ||
455 | h0 += g4; | |
456 | h1 += h0>>32; h0 &= 0xffffffffU; | |
457 | h2 += h1>>32; h1 &= 0xffffffffU; | |
458 | h3 += h2>>32; h2 &= 0xffffffffU; | |
459 | ||
460 | /* compute h + -p */ | |
461 | g0 = (u32)(t = h0 + 5); | |
462 | g1 = (u32)(t = h1 + (t >> 32)); | |
463 | g2 = (u32)(t = h2 + (t >> 32)); | |
464 | g3 = (u32)(t = h3 + (t >> 32)); | |
465 | g4 = h4 + (u32)(t >> 32); | |
466 | ||
467 | /* if there was carry, select g0-g3 */ | |
468 | mask = 0 - (g4 >> 2); | |
469 | g0 &= mask; | |
470 | g1 &= mask; | |
471 | g2 &= mask; | |
472 | g3 &= mask; | |
473 | mask = ~mask; | |
474 | g0 |= (h0 & mask); | |
475 | g1 |= (h1 & mask); | |
476 | g2 |= (h2 & mask); | |
477 | g3 |= (h3 & mask); | |
478 | ||
479 | /* mac = (h + nonce) % (2^128) */ | |
480 | g0 = (u32)(t = (u64)g0 + nonce[0]); | |
481 | g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]); | |
482 | g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]); | |
483 | g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]); | |
484 | ||
485 | U32TO8(mac + 0, g0); | |
486 | U32TO8(mac + 4, g1); | |
487 | U32TO8(mac + 8, g2); | |
488 | U32TO8(mac + 12, g3); | |
489 | } |