/*
 * Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <string.h>
#include <openssl/crypto.h>
#include "internal/modes_int.h"

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef GETU32
# define GETU32(p) BSWAP4(*(const u32 *)(p))
# undef PUTU32
# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) do { \
    if (sizeof(size_t)==8) { \
        u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
        V.lo = (V.hi<<63)|(V.lo>>1); \
        V.hi = (V.hi>>1 )^T; \
    } \
    else { \
        u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
        V.lo = (V.hi<<63)|(V.lo>>1); \
        V.hi = (V.hi>>1 )^((u64)T<<32); \
    } \
} while(0)

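/*
 * Editor's note: GHASH works in GF(2^128) with bit-reversed coefficients,
 * so REDUCE1BIT's multiplication of V by x is a one-bit right shift; the
 * bit shifted out of V.lo is folded back in by XOR-ing the reflected
 * reduction constant 0xe1 (for the polynomial x^128 + x^7 + x^2 + x + 1)
 * into the top of V.hi. The 0-(V.lo&1) expression builds an all-ones or
 * all-zeros mask, keeping the conditional reduction branch-free.
 */
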
/*-
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8: 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
 * to as "Shoup's" in the GCM specification, so OpenSSL does not cover
 * the whole spectrum of possible table-driven implementations. Why? In
 * the non-"Shoup's" case the memory access pattern is segmented in such
 * a manner that it is trivial to see that cache-timing information can
 * reveal a fair portion of the intermediate hash value. Given that the
 * ciphertext is always available to an attacker, the attacker can
 * attempt to deduce the secret parameter H and, if successful, tamper
 * with messages [which is trivial in CTR mode]. In the "Shoup's" case
 * it is not as easy, but there is no reason to believe the approach is
 * resistant to cache-timing attacks either. As for the "8-bit"
 * implementation, it consumes 16 (sixteen) times more memory, 4KB per
 * individual key + 1KB shared, though on the pro side it should be
 * twice as fast as the "4-bit" version. For gcc-generated x86[_64] code
 * the "8-bit" version was observed to run ~75% faster, closer to 100%
 * for commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it is believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   the performance of other code paths (not necessarily even from
 *   the same thread in a Hyper-Threading world);
 *
 * Value of 1 is not appropriate for performance reasons.
 */
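/*
 * Editor's note: in every table-driven variant below, Htable[i] holds the
 * product i*H in GF(2^128), indexed directly by a nibble or byte of the
 * hash input; since multiplication by H is GF(2)-linear, the tables
 * satisfy Htable[a ^ b] = Htable[a] ^ Htable[b].
 */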
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
        } else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}

# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

# if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
# else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by approximately
     * the same amount of time as it saves in each loop iteration. In other
     * words, single-block performance is approximately the same as for the
     * straightforward "4-bit" implementation, and from there on it only
     * gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
# endif

        if (is_endian.little) {
# ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
# else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
# endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
# endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif

# define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate cache
 * thrashing. In other words, the idea is to hash data while it is still
 * in L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK (3*1024)
# endif

#else /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
     (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx gcm_init_clmul
#   define gcm_gmult_avx gcm_gmult_clmul
#   define gcm_ghash_avx gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef GCM_MUL
# define GCM_MUL(ctx) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef GHASH
#  define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}

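/*-
 * Editor's sketch of the intended calling sequence (hypothetical example,
 * AES-128 via the usual <openssl/aes.h> block routines; error handling
 * omitted):
 *
 *      AES_KEY aes;
 *      GCM128_CONTEXT gcm;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &aes);
 *      CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *      CRYPTO_gcm128_aad(&gcm, aad, aad_len);      (all AAD before data)
 *      CRYPTO_gcm128_encrypt(&gcm, in, out, len);  (may be called repeatedly)
 *      CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));  (or _finish() to verify)
 *
 * CRYPTO_gcm128_setiv below also implements the J0 derivation from NIST
 * SP 800-38D: a 96-bit IV is used as-is with the 32-bit counter set to 1,
 * while any other IV length is GHASHed together with its 64-bit bit length.
 */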
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);

        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}

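/*
 * Editor's note: all AAD has to be supplied before any message data; once
 * CRYPTO_gcm128_encrypt/decrypt has been called, CRYPTO_gcm128_aad returns
 * -2. It returns -1 if the accumulated AAD length exceeds 2^61 bytes,
 * mirroring the 2^64-bit AAD cap in NIST SP 800-38D.
 */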
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}

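/*
 * Editor's note: the mlen overflow checks in the functions below enforce
 * NIST SP 800-38D's per-invocation plaintext limit of 2^39 - 256 bits,
 * i.e. (2^36 - 32) bytes (about 64 GiB); -1 is returned once the running
 * total would exceed it.
 */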
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t *out_t = (size_t *)out;
                    const size_t *in_t = (const size_t *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t *out_t = (size_t *)out;
                    const size_t *in_t = (const size_t *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

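/*
 * Editor's note: decryption mirrors the encryption path above, except that
 * GHASH must see the ciphertext, so the input is hashed (or staged in Xn)
 * before being XORed with the keystream rather than after.
 */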
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t *out_t = (size_t *)out;
                    const size_t *in_t = (const size_t *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t *out_t = (size_t *)out;
                    const size_t *in_t = (const size_t *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

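/*
 * Editor's note: the *_ctr32 variants hand bulk work to a ctr128_f routine
 * (a counter-mode kernel such as a platform-specific AES implementation)
 * that encrypts whole 16-byte blocks while incrementing only the low 32
 * bits of the counter block, big-endian; partial blocks and all GHASH
 * bookkeeping are still handled here in C.
 */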
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

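/*
 * Editor's note: CRYPTO_gcm128_finish folds the AAD and message bit
 * lengths into the hash, XORs in EK0 to form the tag, and compares it to
 * the caller's tag using CRYPTO_memcmp, whose running time is independent
 * of the data compared, so a mismatching tag does not leak how many bytes
 * matched. It returns 0 only if the tag verifies.
 */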
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif

    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}