]>
Commit | Line | Data |
---|---|---|
f2d78649 AP |
1 | /* |
2 | * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | * | |
49d3b641 | 4 | * Licensed under the Apache License 2.0 (the "License"). You may not use |
f2d78649 AP |
5 | * this file except in compliance with the License. You can obtain a copy |
6 | * in the file LICENSE in the source distribution or at | |
7 | * https://www.openssl.org/source/license.html | |
8 | */ | |
9 | ||
/*
 * This module is meant to be used as a template for base 2^44 assembly
 * implementation[s]. As a side note, compiler-generated code is not
 * slower than compiler-generated base 2^64 code on [high-end] x86_64,
 * even though the number of multiplications is 50% higher. Go figure...
 */
16 | #include <stdlib.h> | |
17 | ||
/*
 * Fixed-width integer aliases used throughout this module.
 *
 * u64 is declared as unsigned long long rather than unsigned long: the
 * latter is only 32 bits wide on LLP64 targets (which may still provide
 * __int128), and that would silently corrupt every 44-bit digit
 * computation in this file.
 */
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef unsigned __int128 u128;

/*
 * Compile-time guards: the arithmetic relies on u64 shifts truncating at
 * exactly 64 bits and on u128 holding a full 64x64-bit product. A negative
 * array size aborts compilation if either assumption does not hold.
 */
typedef char poly1305_u64_must_be_8_bytes[sizeof(u64) == 8 ? 1 : -1];
typedef char poly1305_u128_must_be_16_bytes[sizeof(u128) == 16 ? 1 : -1];

/*
 * Internal Poly1305 state in base 2^44 representation. The field order is
 * kept as-is because the context is handed around as an opaque pointer.
 */
typedef struct {
    u64 h[3];                   /* running accumulator, three digits */
    u64 s[2];                   /* precomputed 20*r[1] and 20*r[2] */
    u64 r[3];                   /* masked key, three digits */
} poly1305_internal;

/* Number of message bytes consumed per iteration of poly1305_blocks(). */
#define POLY1305_BLOCK_SIZE 16
30 | ||
31 | /* pick 64-bit unsigned integer in little endian order */ | |
32 | static u64 U8TOU64(const unsigned char *p) | |
33 | { | |
34 | return (((u64)(p[0] & 0xff)) | | |
35 | ((u64)(p[1] & 0xff) << 8) | | |
36 | ((u64)(p[2] & 0xff) << 16) | | |
37 | ((u64)(p[3] & 0xff) << 24) | | |
38 | ((u64)(p[4] & 0xff) << 32) | | |
39 | ((u64)(p[5] & 0xff) << 40) | | |
40 | ((u64)(p[6] & 0xff) << 48) | | |
41 | ((u64)(p[7] & 0xff) << 56)); | |
42 | } | |
43 | ||
44 | /* store a 64-bit unsigned integer in little endian */ | |
45 | static void U64TO8(unsigned char *p, u64 v) | |
46 | { | |
47 | p[0] = (unsigned char)((v) & 0xff); | |
48 | p[1] = (unsigned char)((v >> 8) & 0xff); | |
49 | p[2] = (unsigned char)((v >> 16) & 0xff); | |
50 | p[3] = (unsigned char)((v >> 24) & 0xff); | |
51 | p[4] = (unsigned char)((v >> 32) & 0xff); | |
52 | p[5] = (unsigned char)((v >> 40) & 0xff); | |
53 | p[6] = (unsigned char)((v >> 48) & 0xff); | |
54 | p[7] = (unsigned char)((v >> 56) & 0xff); | |
55 | } | |
56 | ||
57 | int poly1305_init(void *ctx, const unsigned char key[16]) | |
58 | { | |
59 | poly1305_internal *st = (poly1305_internal *)ctx; | |
60 | u64 r0, r1; | |
61 | ||
62 | /* h = 0 */ | |
63 | st->h[0] = 0; | |
64 | st->h[1] = 0; | |
65 | st->h[2] = 0; | |
66 | ||
67 | r0 = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; | |
68 | r1 = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; | |
69 | ||
70 | /* break r1:r0 to three 44-bit digits, masks are 1<<44-1 */ | |
71 | st->r[0] = r0 & 0x0fffffffffff; | |
72 | st->r[1] = ((r0 >> 44) | (r1 << 20)) & 0x0fffffffffff; | |
73 | st->r[2] = (r1 >> 24); | |
74 | ||
75 | st->s[0] = (st->r[1] + (st->r[1] << 2)) << 2; | |
76 | st->s[1] = (st->r[2] + (st->r[2] << 2)) << 2; | |
77 | ||
78 | return 0; | |
79 | } | |
80 | ||
81 | void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, | |
82 | u32 padbit) | |
83 | { | |
84 | poly1305_internal *st = (poly1305_internal *)ctx; | |
85 | u64 r0, r1, r2; | |
86 | u64 s1, s2; | |
87 | u64 h0, h1, h2, c; | |
88 | u128 d0, d1, d2; | |
89 | u64 pad = (u64)padbit << 40; | |
90 | ||
91 | r0 = st->r[0]; | |
92 | r1 = st->r[1]; | |
93 | r2 = st->r[2]; | |
94 | ||
95 | s1 = st->s[0]; | |
96 | s2 = st->s[1]; | |
97 | ||
98 | h0 = st->h[0]; | |
99 | h1 = st->h[1]; | |
100 | h2 = st->h[2]; | |
101 | ||
102 | while (len >= POLY1305_BLOCK_SIZE) { | |
103 | u64 m0, m1; | |
104 | ||
105 | m0 = U8TOU64(inp + 0); | |
106 | m1 = U8TOU64(inp + 8); | |
107 | ||
108 | /* h += m[i], m[i] is broken to 44-bit digits */ | |
109 | h0 += m0 & 0x0fffffffffff; | |
110 | h1 += ((m0 >> 44) | (m1 << 20)) & 0x0fffffffffff; | |
111 | h2 += (m1 >> 24) + pad; | |
112 | ||
113 | /* h *= r "%" p, where "%" stands for "partial remainder" */ | |
114 | d0 = ((u128)h0 * r0) + ((u128)h1 * s2) + ((u128)h2 * s1); | |
115 | d1 = ((u128)h0 * r1) + ((u128)h1 * r0) + ((u128)h2 * s2); | |
116 | d2 = ((u128)h0 * r2) + ((u128)h1 * r1) + ((u128)h2 * r0); | |
117 | ||
118 | /* "lazy" reduction step */ | |
119 | h0 = (u64)d0 & 0x0fffffffffff; | |
9872238e AP |
120 | h1 = (u64)(d1 += (u64)(d0 >> 44)) & 0x0fffffffffff; |
121 | h2 = (u64)(d2 += (u64)(d1 >> 44)) & 0x03ffffffffff; /* last 42 bits */ | |
f2d78649 AP |
122 | |
123 | c = (d2 >> 42); | |
124 | h0 += c + (c << 2); | |
125 | ||
126 | inp += POLY1305_BLOCK_SIZE; | |
127 | len -= POLY1305_BLOCK_SIZE; | |
128 | } | |
129 | ||
130 | st->h[0] = h0; | |
131 | st->h[1] = h1; | |
132 | st->h[2] = h2; | |
133 | } | |
134 | ||
135 | void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) | |
136 | { | |
137 | poly1305_internal *st = (poly1305_internal *) ctx; | |
138 | u64 h0, h1, h2; | |
139 | u64 g0, g1, g2; | |
140 | u128 t; | |
141 | u64 mask; | |
142 | ||
143 | h0 = st->h[0]; | |
144 | h1 = st->h[1]; | |
145 | h2 = st->h[2]; | |
146 | ||
147 | /* after "lazy" reduction, convert 44+bit digits to 64-bit ones */ | |
148 | h0 = (u64)(t = (u128)h0 + (h1 << 44)); h1 >>= 20; | |
149 | h1 = (u64)(t = (u128)h1 + (h2 << 24) + (t >> 64)); h2 >>= 40; | |
150 | h2 += (u64)(t >> 64); | |
151 | ||
152 | /* compare to modulus by computing h + -p */ | |
153 | g0 = (u64)(t = (u128)h0 + 5); | |
154 | g1 = (u64)(t = (u128)h1 + (t >> 64)); | |
155 | g2 = h2 + (u64)(t >> 64); | |
156 | ||
157 | /* if there was carry into 131st bit, h1:h0 = g1:g0 */ | |
158 | mask = 0 - (g2 >> 2); | |
159 | g0 &= mask; | |
160 | g1 &= mask; | |
161 | mask = ~mask; | |
162 | h0 = (h0 & mask) | g0; | |
163 | h1 = (h1 & mask) | g1; | |
164 | ||
165 | /* mac = (h + nonce) % (2^128) */ | |
166 | h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); | |
167 | h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); | |
168 | ||
169 | U64TO8(mac + 0, h0); | |
170 | U64TO8(mac + 8, h1); | |
171 | } |