]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/gcm128.c
Some cleanups to apps/ca.c
[thirdparty/openssl.git] / crypto / modes / gcm128.c
CommitLineData
4f22f405
RS
1/*
2 * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
e7f5b1cd 3 *
4f22f405
RS
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
e7f5b1cd
AP
8 */
9
aa763c0f 10#include <openssl/crypto.h>
f472ec8c 11#include "modes_lcl.h"
e7f5b1cd
AP
12#include <string.h>
13
f472ec8c
AP
14#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
15/* redefine, because alignment is ensured */
0f113f3e
MC
16# undef GETU32
17# define GETU32(p) BSWAP4(*(const u32 *)(p))
18# undef PUTU32
19# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
20#endif
21
/*
 * PACK() places a 16-bit constant in the 16 most significant bits of a
 * size_t, whatever the width of size_t is.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT() shifts the 128-bit value V (members .hi and .lo) right by one
 * bit. If the bit shifted out of V.lo was set, 0xe1 is XORed into the most
 * significant byte of V.hi. The mask is built in a u64 when size_t is 64 bits
 * wide and in a u32 otherwise.
 *
 * V is evaluated several times, so it must be a side-effect-free lvalue. It
 * is parenthesized so that arguments such as *ptr expand correctly.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
35
1d97c843 36/*-
d8d95832
AP
37 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
38 * never be set to 8. 8 is effectively reserved for testing purposes.
39 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
40 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
41 * whole spectrum of possible table driven implementations. Why? In
42 * non-"Shoup's" case memory access pattern is segmented in such manner,
43 * that it's trivial to see that cache timing information can reveal
44 * fair portion of intermediate hash value. Given that ciphertext is
45 * always available to attacker, it's possible for him to attempt to
46 * deduce secret parameter H and if successful, tamper with messages
47 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
48 * not as trivial, but there is no reason to believe that it's resistant
49 * to cache-timing attack. And the thing about "8-bit" implementation is
50 * that it consumes 16 (sixteen) times more memory, 4KB per individual
51 * key + 1KB shared. Well, on pros side it should be twice as fast as
52 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
53 * was observed to run ~75% faster, closer to 100% for commercial
54 * compilers... Yet "4-bit" procedure is preferred, because it's
55 * believed to provide better security-performance balance and adequate
56 * all-round performance. "All-round" refers to things like:
57 *
58 * - shorter setup time effectively improves overall timing for
59 * handling short messages;
60 * - larger table allocation can become unbearable because of VM
61 * subsystem penalties (for example on Windows large enough free
62 * results in VM working set trimming, meaning that consequent
63 * malloc would immediately incur working set expansion);
64 * - larger table has larger cache footprint, which can affect
65 * performance of other code paths (not necessarily even from same
66 * thread in Hyper-Threading world);
67 *
68 * Value of 1 is not appropriate for performance reasons.
69 */
0f113f3e 70#if TABLE_BITS==8
a595baff 71
e7f5b1cd
AP
72static void gcm_init_8bit(u128 Htable[256], u64 H[2])
73{
0f113f3e
MC
74 int i, j;
75 u128 V;
76
77 Htable[0].hi = 0;
78 Htable[0].lo = 0;
79 V.hi = H[0];
80 V.lo = H[1];
81
82 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
83 REDUCE1BIT(V);
84 Htable[i] = V;
85 }
86
87 for (i = 2; i < 256; i <<= 1) {
88 u128 *Hi = Htable + i, H0 = *Hi;
89 for (j = 1; j < i; ++j) {
90 Hi[j].hi = H0.hi ^ Htable[j].hi;
91 Hi[j].lo = H0.lo ^ Htable[j].lo;
92 }
93 }
e7f5b1cd
AP
94}
95
d8d95832 96static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
e7f5b1cd 97{
0f113f3e
MC
98 u128 Z = { 0, 0 };
99 const u8 *xi = (const u8 *)Xi + 15;
100 size_t rem, n = *xi;
101 const union {
102 long one;
103 char little;
2e635aa8 104 } is_endian = { 1 };
0f113f3e
MC
105 static const size_t rem_8bit[256] = {
106 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
107 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
108 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
109 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
110 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
111 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
112 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
113 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
114 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
115 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
116 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
117 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
118 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
119 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
120 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
121 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
122 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
123 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
124 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
125 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
126 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
127 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
128 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
129 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
130 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
131 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
132 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
133 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
134 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
135 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
136 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
137 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
138 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
139 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
140 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
141 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
142 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
143 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
144 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
145 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
146 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
147 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
148 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
149 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
150 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
151 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
152 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
153 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
154 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
155 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
156 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
157 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
158 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
159 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
160 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
161 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
162 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
163 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
164 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
165 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
166 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
167 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
168 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
169 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
170 };
171
172 while (1) {
173 Z.hi ^= Htable[n].hi;
174 Z.lo ^= Htable[n].lo;
175
176 if ((u8 *)Xi == xi)
177 break;
178
179 n = *(--xi);
180
181 rem = (size_t)Z.lo & 0xff;
182 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
183 Z.hi = (Z.hi >> 8);
184 if (sizeof(size_t) == 8)
185 Z.hi ^= rem_8bit[rem];
186 else
187 Z.hi ^= (u64)rem_8bit[rem] << 32;
188 }
189
190 if (is_endian.little) {
191# ifdef BSWAP8
192 Xi[0] = BSWAP8(Z.hi);
193 Xi[1] = BSWAP8(Z.lo);
194# else
195 u8 *p = (u8 *)Xi;
196 u32 v;
197 v = (u32)(Z.hi >> 32);
198 PUTU32(p, v);
199 v = (u32)(Z.hi);
200 PUTU32(p + 4, v);
201 v = (u32)(Z.lo >> 32);
202 PUTU32(p + 8, v);
203 v = (u32)(Z.lo);
204 PUTU32(p + 12, v);
205# endif
206 } else {
207 Xi[0] = Z.hi;
208 Xi[1] = Z.lo;
209 }
e7f5b1cd 210}
e7f5b1cd 211
0f113f3e
MC
212# define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
213
214#elif TABLE_BITS==4
2262beef 215
e7f5b1cd
AP
216static void gcm_init_4bit(u128 Htable[16], u64 H[2])
217{
0f113f3e
MC
218 u128 V;
219# if defined(OPENSSL_SMALL_FOOTPRINT)
220 int i;
221# endif
e7f5b1cd 222
0f113f3e
MC
223 Htable[0].hi = 0;
224 Htable[0].lo = 0;
225 V.hi = H[0];
226 V.lo = H[1];
227
228# if defined(OPENSSL_SMALL_FOOTPRINT)
229 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
230 REDUCE1BIT(V);
231 Htable[i] = V;
232 }
233
234 for (i = 2; i < 16; i <<= 1) {
235 u128 *Hi = Htable + i;
236 int j;
237 for (V = *Hi, j = 1; j < i; ++j) {
238 Hi[j].hi = V.hi ^ Htable[j].hi;
239 Hi[j].lo = V.lo ^ Htable[j].lo;
240 }
241 }
242# else
243 Htable[8] = V;
244 REDUCE1BIT(V);
245 Htable[4] = V;
246 REDUCE1BIT(V);
247 Htable[2] = V;
248 REDUCE1BIT(V);
249 Htable[1] = V;
250 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
251 V = Htable[4];
252 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
253 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
254 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
255 V = Htable[8];
256 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
257 Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
258 Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
259 Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
260 Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
261 Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
262 Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
263# endif
264# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
265 /*
266 * ARM assembler expects specific dword order in Htable.
267 */
268 {
269 int j;
270 const union {
271 long one;
272 char little;
2e635aa8 273 } is_endian = { 1 };
0f113f3e
MC
274
275 if (is_endian.little)
276 for (j = 0; j < 16; ++j) {
277 V = Htable[j];
278 Htable[j].hi = V.lo;
279 Htable[j].lo = V.hi;
280 } else
281 for (j = 0; j < 16; ++j) {
282 V = Htable[j];
283 Htable[j].hi = V.lo << 32 | V.lo >> 32;
284 Htable[j].lo = V.hi << 32 | V.hi >> 32;
285 }
286 }
287# endif
e7f5b1cd
AP
288}
289
0f113f3e 290# ifndef GHASH_ASM
2262beef 291static const size_t rem_4bit[16] = {
0f113f3e
MC
292 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
293 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
294 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
295 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
296};
2262beef 297
4f39edbf 298static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
e7f5b1cd 299{
0f113f3e
MC
300 u128 Z;
301 int cnt = 15;
302 size_t rem, nlo, nhi;
303 const union {
304 long one;
305 char little;
2e635aa8 306 } is_endian = { 1 };
0f113f3e
MC
307
308 nlo = ((const u8 *)Xi)[15];
309 nhi = nlo >> 4;
310 nlo &= 0xf;
311
312 Z.hi = Htable[nlo].hi;
313 Z.lo = Htable[nlo].lo;
314
315 while (1) {
316 rem = (size_t)Z.lo & 0xf;
317 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
318 Z.hi = (Z.hi >> 4);
319 if (sizeof(size_t) == 8)
320 Z.hi ^= rem_4bit[rem];
321 else
322 Z.hi ^= (u64)rem_4bit[rem] << 32;
323
324 Z.hi ^= Htable[nhi].hi;
325 Z.lo ^= Htable[nhi].lo;
326
327 if (--cnt < 0)
328 break;
329
330 nlo = ((const u8 *)Xi)[cnt];
331 nhi = nlo >> 4;
332 nlo &= 0xf;
333
334 rem = (size_t)Z.lo & 0xf;
335 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
336 Z.hi = (Z.hi >> 4);
337 if (sizeof(size_t) == 8)
338 Z.hi ^= rem_4bit[rem];
339 else
340 Z.hi ^= (u64)rem_4bit[rem] << 32;
341
342 Z.hi ^= Htable[nlo].hi;
343 Z.lo ^= Htable[nlo].lo;
344 }
345
346 if (is_endian.little) {
347# ifdef BSWAP8
348 Xi[0] = BSWAP8(Z.hi);
349 Xi[1] = BSWAP8(Z.lo);
350# else
351 u8 *p = (u8 *)Xi;
352 u32 v;
353 v = (u32)(Z.hi >> 32);
354 PUTU32(p, v);
355 v = (u32)(Z.hi);
356 PUTU32(p + 4, v);
357 v = (u32)(Z.lo >> 32);
358 PUTU32(p + 8, v);
359 v = (u32)(Z.lo);
360 PUTU32(p + 12, v);
361# endif
362 } else {
363 Xi[0] = Z.hi;
364 Xi[1] = Z.lo;
365 }
2262beef
AP
366}
367
0f113f3e 368# if !defined(OPENSSL_SMALL_FOOTPRINT)
2262beef
AP
369/*
370 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
a595baff
AP
371 * details... Compiler-generated code doesn't seem to give any
372 * performance improvement, at least not on x86[_64]. It's here
373 * mostly as reference and a placeholder for possible future
374 * non-trivial optimization[s]...
2262beef 375 */
0f113f3e
MC
376static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
377 const u8 *inp, size_t len)
2262beef
AP
378{
379 u128 Z;
380 int cnt;
381 size_t rem, nlo, nhi;
0f113f3e
MC
382 const union {
383 long one;
384 char little;
2e635aa8 385 } is_endian = { 1 };
0f113f3e
MC
386
387# if 1
2262beef 388 do {
0f113f3e
MC
389 cnt = 15;
390 nlo = ((const u8 *)Xi)[15];
391 nlo ^= inp[15];
392 nhi = nlo >> 4;
393 nlo &= 0xf;
394
395 Z.hi = Htable[nlo].hi;
396 Z.lo = Htable[nlo].lo;
397
398 while (1) {
399 rem = (size_t)Z.lo & 0xf;
400 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
401 Z.hi = (Z.hi >> 4);
402 if (sizeof(size_t) == 8)
403 Z.hi ^= rem_4bit[rem];
404 else
405 Z.hi ^= (u64)rem_4bit[rem] << 32;
406
407 Z.hi ^= Htable[nhi].hi;
408 Z.lo ^= Htable[nhi].lo;
409
410 if (--cnt < 0)
411 break;
412
413 nlo = ((const u8 *)Xi)[cnt];
414 nlo ^= inp[cnt];
415 nhi = nlo >> 4;
416 nlo &= 0xf;
417
418 rem = (size_t)Z.lo & 0xf;
419 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
420 Z.hi = (Z.hi >> 4);
421 if (sizeof(size_t) == 8)
422 Z.hi ^= rem_4bit[rem];
423 else
424 Z.hi ^= (u64)rem_4bit[rem] << 32;
425
426 Z.hi ^= Htable[nlo].hi;
427 Z.lo ^= Htable[nlo].lo;
428 }
429# else
e747f4d4
AP
430 /*
431 * Extra 256+16 bytes per-key plus 512 bytes shared tables
432 * [should] give ~50% improvement... One could have PACK()-ed
6acb4ff3
AP
433 * the rem_8bit even here, but the priority is to minimize
434 * cache footprint...
0f113f3e
MC
435 */
436 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
437 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
e747f4d4 438 static const unsigned short rem_8bit[256] = {
0f113f3e
MC
439 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
440 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
441 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
442 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
443 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
444 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
445 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
446 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
447 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
448 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
449 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
450 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
451 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
452 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
453 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
454 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
455 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
456 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
457 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
458 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
459 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
460 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
461 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
462 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
463 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
464 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
465 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
466 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
467 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
468 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
469 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
470 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
471 };
e747f4d4
AP
472 /*
473 * This pre-processing phase slows down procedure by approximately
474 * same time as it makes each loop spin faster. In other words
475 * single block performance is approximately same as straightforward
476 * "4-bit" implementation, and then it goes only faster...
477 */
0f113f3e
MC
478 for (cnt = 0; cnt < 16; ++cnt) {
479 Z.hi = Htable[cnt].hi;
480 Z.lo = Htable[cnt].lo;
481 Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
482 Hshr4[cnt].hi = (Z.hi >> 4);
483 Hshl4[cnt] = (u8)(Z.lo << 4);
e747f4d4
AP
484 }
485
486 do {
0f113f3e
MC
487 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
488 nlo = ((const u8 *)Xi)[cnt];
489 nlo ^= inp[cnt];
490 nhi = nlo >> 4;
491 nlo &= 0xf;
e747f4d4 492
0f113f3e
MC
493 Z.hi ^= Htable[nlo].hi;
494 Z.lo ^= Htable[nlo].lo;
e747f4d4 495
0f113f3e 496 rem = (size_t)Z.lo & 0xff;
e747f4d4 497
0f113f3e
MC
498 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
499 Z.hi = (Z.hi >> 8);
e747f4d4 500
0f113f3e
MC
501 Z.hi ^= Hshr4[nhi].hi;
502 Z.lo ^= Hshr4[nhi].lo;
503 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
504 }
e747f4d4 505
0f113f3e
MC
506 nlo = ((const u8 *)Xi)[0];
507 nlo ^= inp[0];
508 nhi = nlo >> 4;
509 nlo &= 0xf;
e747f4d4 510
0f113f3e
MC
511 Z.hi ^= Htable[nlo].hi;
512 Z.lo ^= Htable[nlo].lo;
e747f4d4 513
0f113f3e 514 rem = (size_t)Z.lo & 0xf;
e747f4d4 515
0f113f3e
MC
516 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
517 Z.hi = (Z.hi >> 4);
e747f4d4 518
0f113f3e
MC
519 Z.hi ^= Htable[nhi].hi;
520 Z.lo ^= Htable[nhi].lo;
521 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
522# endif
e7f5b1cd 523
0f113f3e
MC
524 if (is_endian.little) {
525# ifdef BSWAP8
526 Xi[0] = BSWAP8(Z.hi);
527 Xi[1] = BSWAP8(Z.lo);
528# else
529 u8 *p = (u8 *)Xi;
530 u32 v;
531 v = (u32)(Z.hi >> 32);
532 PUTU32(p, v);
533 v = (u32)(Z.hi);
534 PUTU32(p + 4, v);
535 v = (u32)(Z.lo >> 32);
536 PUTU32(p + 8, v);
537 v = (u32)(Z.lo);
538 PUTU32(p + 12, v);
539# endif
540 } else {
541 Xi[0] = Z.hi;
542 Xi[1] = Z.lo;
543 }
544 } while (inp += 16, len -= 16);
e7f5b1cd 545}
0f113f3e
MC
546# endif
547# else
548void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
549void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
550 size_t len);
551# endif
2262beef 552
0f113f3e
MC
553# define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
554# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
555# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it is
 * still in the L1 cache after the encryption pass...
 */
561# define GHASH_CHUNK (3*1024)
562# endif
2262beef 563
0f113f3e 564#else /* TABLE_BITS */
e7f5b1cd 565
0f113f3e 566static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
e7f5b1cd 567{
0f113f3e
MC
568 u128 V, Z = { 0, 0 };
569 long X;
570 int i, j;
571 const long *xi = (const long *)Xi;
572 const union {
573 long one;
574 char little;
2e635aa8 575 } is_endian = { 1 };
0f113f3e
MC
576
577 V.hi = H[0]; /* H is in host byte order, no byte swapping */
578 V.lo = H[1];
579
580 for (j = 0; j < 16 / sizeof(long); ++j) {
581 if (is_endian.little) {
582 if (sizeof(long) == 8) {
583# ifdef BSWAP8
584 X = (long)(BSWAP8(xi[j]));
585# else
586 const u8 *p = (const u8 *)(xi + j);
587 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
588# endif
589 } else {
590 const u8 *p = (const u8 *)(xi + j);
591 X = (long)GETU32(p);
592 }
593 } else
594 X = xi[j];
595
596 for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
597 u64 M = (u64)(X >> (8 * sizeof(long) - 1));
598 Z.hi ^= V.hi & M;
599 Z.lo ^= V.lo & M;
600
601 REDUCE1BIT(V);
602 }
603 }
604
605 if (is_endian.little) {
606# ifdef BSWAP8
607 Xi[0] = BSWAP8(Z.hi);
608 Xi[1] = BSWAP8(Z.lo);
609# else
610 u8 *p = (u8 *)Xi;
611 u32 v;
612 v = (u32)(Z.hi >> 32);
613 PUTU32(p, v);
614 v = (u32)(Z.hi);
615 PUTU32(p + 4, v);
616 v = (u32)(Z.lo >> 32);
617 PUTU32(p + 8, v);
618 v = (u32)(Z.lo);
619 PUTU32(p + 12, v);
620# endif
621 } else {
622 Xi[0] = Z.hi;
623 Xi[1] = Z.lo;
624 }
e7f5b1cd 625}
0f113f3e
MC
626
627# define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
a595baff 628
e7f5b1cd
AP
629#endif
630
0f113f3e
MC
631#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
632# if !defined(I386_ONLY) && \
633 (defined(__i386) || defined(__i386__) || \
634 defined(__x86_64) || defined(__x86_64__) || \
635 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
1e863180
AP
636# define GHASH_ASM_X86_OR_64
637# define GCM_FUNCREF_4BIT
75c4827d 638extern unsigned int OPENSSL_ia32cap_P[];
c1f092d1 639
0f113f3e
MC
640void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
641void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
642void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
643 size_t len);
c1f092d1 644
0f113f3e
MC
645# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
646# define gcm_init_avx gcm_init_clmul
647# define gcm_gmult_avx gcm_gmult_clmul
648# define gcm_ghash_avx gcm_ghash_clmul
649# else
650void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
651void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
652void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
653 size_t len);
654# endif
1da5d302 655
0f113f3e 656# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
1e863180 657# define GHASH_ASM_X86
0f113f3e
MC
658void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
659void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
660 size_t len);
c1f092d1 661
0f113f3e
MC
662void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
663void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
664 size_t len);
1e863180 665# endif
82741e9c 666# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
1e863180 667# include "arm_arch.h"
c1669e1c 668# if __ARM_MAX_ARCH__>=7
1e863180
AP
669# define GHASH_ASM_ARM
670# define GCM_FUNCREF_4BIT
0f113f3e 671# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
82741e9c 672# if defined(__arm__) || defined(__arm)
0f113f3e 673# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
82741e9c 674# endif
0f113f3e
MC
675void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
676void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
677void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
678 size_t len);
679void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
680void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
681void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
682 size_t len);
1e863180 683# endif
23328d4b
AP
684# elif defined(__sparc__) || defined(__sparc)
685# include "sparc_arch.h"
686# define GHASH_ASM_SPARC
687# define GCM_FUNCREF_4BIT
688extern unsigned int OPENSSL_sparcv9cap_P[];
0f113f3e
MC
689void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
690void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
691void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
692 size_t len);
693# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
0e716d92
AP
694# include "ppc_arch.h"
695# define GHASH_ASM_PPC
696# define GCM_FUNCREF_4BIT
0f113f3e
MC
697void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
698void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
699void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
700 size_t len);
c1f092d1 701# endif
c1f092d1
AP
702#endif
703
7af04002
AP
704#ifdef GCM_FUNCREF_4BIT
705# undef GCM_MUL
0f113f3e 706# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
7af04002
AP
707# ifdef GHASH
708# undef GHASH
0f113f3e 709# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
7af04002
AP
710# endif
711#endif
712
0f113f3e 713void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
e7f5b1cd 714{
0f113f3e
MC
715 const union {
716 long one;
717 char little;
2e635aa8 718 } is_endian = { 1 };
e7f5b1cd 719
0f113f3e
MC
720 memset(ctx, 0, sizeof(*ctx));
721 ctx->block = block;
722 ctx->key = key;
e7f5b1cd 723
0f113f3e 724 (*block) (ctx->H.c, ctx->H.c, key);
e7f5b1cd 725
0f113f3e
MC
726 if (is_endian.little) {
727 /* H is stored in host byte order */
e7f5b1cd 728#ifdef BSWAP8
0f113f3e
MC
729 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
730 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
e7f5b1cd 731#else
0f113f3e
MC
732 u8 *p = ctx->H.c;
733 u64 hi, lo;
734 hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
735 lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
736 ctx->H.u[0] = hi;
737 ctx->H.u[1] = lo;
e7f5b1cd 738#endif
0f113f3e
MC
739 }
740#if TABLE_BITS==8
741 gcm_init_8bit(ctx->Htable, ctx->H.u);
742#elif TABLE_BITS==4
2e635aa8
AP
743# if defined(GHASH)
744# define CTX__GHASH(f) (ctx->ghash = (f))
745# else
746# define CTX__GHASH(f) (ctx->ghash = NULL)
747# endif
0f113f3e
MC
748# if defined(GHASH_ASM_X86_OR_64)
749# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
6e5a853b 750 if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
0f113f3e
MC
751 if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
752 gcm_init_avx(ctx->Htable, ctx->H.u);
753 ctx->gmult = gcm_gmult_avx;
2e635aa8 754 CTX__GHASH(gcm_ghash_avx);
0f113f3e
MC
755 } else {
756 gcm_init_clmul(ctx->Htable, ctx->H.u);
757 ctx->gmult = gcm_gmult_clmul;
2e635aa8 758 CTX__GHASH(gcm_ghash_clmul);
0f113f3e
MC
759 }
760 return;
761 }
a6d915e0 762# endif
0f113f3e
MC
763 gcm_init_4bit(ctx->Htable, ctx->H.u);
764# if defined(GHASH_ASM_X86) /* x86 only */
765# if defined(OPENSSL_IA32_SSE2)
766 if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
98909c1d 767# else
0f113f3e 768 if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
98909c1d 769# endif
0f113f3e 770 ctx->gmult = gcm_gmult_4bit_mmx;
2e635aa8 771 CTX__GHASH(gcm_ghash_4bit_mmx);
0f113f3e
MC
772 } else {
773 ctx->gmult = gcm_gmult_4bit_x86;
2e635aa8 774 CTX__GHASH(gcm_ghash_4bit_x86);
0f113f3e 775 }
c1f092d1 776# else
0f113f3e 777 ctx->gmult = gcm_gmult_4bit;
2e635aa8 778 CTX__GHASH(gcm_ghash_4bit);
c1f092d1 779# endif
0f113f3e 780# elif defined(GHASH_ASM_ARM)
82741e9c 781# ifdef PMULL_CAPABLE
0f113f3e
MC
782 if (PMULL_CAPABLE) {
783 gcm_init_v8(ctx->Htable, ctx->H.u);
784 ctx->gmult = gcm_gmult_v8;
2e635aa8 785 CTX__GHASH(gcm_ghash_v8);
0f113f3e 786 } else
82741e9c
AP
787# endif
788# ifdef NEON_CAPABLE
0f113f3e
MC
789 if (NEON_CAPABLE) {
790 gcm_init_neon(ctx->Htable, ctx->H.u);
791 ctx->gmult = gcm_gmult_neon;
2e635aa8 792 CTX__GHASH(gcm_ghash_neon);
0f113f3e 793 } else
82741e9c 794# endif
0f113f3e
MC
795 {
796 gcm_init_4bit(ctx->Htable, ctx->H.u);
797 ctx->gmult = gcm_gmult_4bit;
2e635aa8 798 CTX__GHASH(gcm_ghash_4bit);
0f113f3e
MC
799 }
800# elif defined(GHASH_ASM_SPARC)
801 if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
802 gcm_init_vis3(ctx->Htable, ctx->H.u);
803 ctx->gmult = gcm_gmult_vis3;
2e635aa8 804 CTX__GHASH(gcm_ghash_vis3);
0f113f3e
MC
805 } else {
806 gcm_init_4bit(ctx->Htable, ctx->H.u);
807 ctx->gmult = gcm_gmult_4bit;
2e635aa8 808 CTX__GHASH(gcm_ghash_4bit);
0f113f3e
MC
809 }
810# elif defined(GHASH_ASM_PPC)
811 if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
812 gcm_init_p8(ctx->Htable, ctx->H.u);
813 ctx->gmult = gcm_gmult_p8;
2e635aa8 814 CTX__GHASH(gcm_ghash_p8);
0f113f3e
MC
815 } else {
816 gcm_init_4bit(ctx->Htable, ctx->H.u);
817 ctx->gmult = gcm_gmult_4bit;
2e635aa8 818 CTX__GHASH(gcm_ghash_4bit);
0f113f3e 819 }
c1f092d1 820# else
0f113f3e 821 gcm_init_4bit(ctx->Htable, ctx->H.u);
c1f092d1 822# endif
2e635aa8 823# undef CTX__GHASH
a595baff 824#endif
e7f5b1cd
AP
825}
826
0f113f3e
MC
827void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
828 size_t len)
e7f5b1cd 829{
0f113f3e
MC
830 const union {
831 long one;
832 char little;
2e635aa8 833 } is_endian = { 1 };
0f113f3e 834 unsigned int ctr;
d8d95832 835#ifdef GCM_FUNCREF_4BIT
0f113f3e
MC
836 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
837#endif
838
839 ctx->Yi.u[0] = 0;
840 ctx->Yi.u[1] = 0;
841 ctx->Xi.u[0] = 0;
842 ctx->Xi.u[1] = 0;
843 ctx->len.u[0] = 0; /* AAD length */
844 ctx->len.u[1] = 0; /* message length */
845 ctx->ares = 0;
846 ctx->mres = 0;
847
848 if (len == 12) {
849 memcpy(ctx->Yi.c, iv, 12);
850 ctx->Yi.c[15] = 1;
851 ctr = 1;
852 } else {
853 size_t i;
854 u64 len0 = len;
855
856 while (len >= 16) {
857 for (i = 0; i < 16; ++i)
858 ctx->Yi.c[i] ^= iv[i];
859 GCM_MUL(ctx, Yi);
860 iv += 16;
861 len -= 16;
862 }
863 if (len) {
864 for (i = 0; i < len; ++i)
865 ctx->Yi.c[i] ^= iv[i];
866 GCM_MUL(ctx, Yi);
867 }
868 len0 <<= 3;
869 if (is_endian.little) {
e7f5b1cd 870#ifdef BSWAP8
0f113f3e 871 ctx->Yi.u[1] ^= BSWAP8(len0);
e7f5b1cd 872#else
0f113f3e
MC
873 ctx->Yi.c[8] ^= (u8)(len0 >> 56);
874 ctx->Yi.c[9] ^= (u8)(len0 >> 48);
875 ctx->Yi.c[10] ^= (u8)(len0 >> 40);
876 ctx->Yi.c[11] ^= (u8)(len0 >> 32);
877 ctx->Yi.c[12] ^= (u8)(len0 >> 24);
878 ctx->Yi.c[13] ^= (u8)(len0 >> 16);
879 ctx->Yi.c[14] ^= (u8)(len0 >> 8);
880 ctx->Yi.c[15] ^= (u8)(len0);
e7f5b1cd 881#endif
0f113f3e
MC
882 } else
883 ctx->Yi.u[1] ^= len0;
e7f5b1cd 884
0f113f3e 885 GCM_MUL(ctx, Yi);
e7f5b1cd 886
0f113f3e 887 if (is_endian.little)
997d1aac 888#ifdef BSWAP4
0f113f3e 889 ctr = BSWAP4(ctx->Yi.d[3]);
997d1aac 890#else
0f113f3e 891 ctr = GETU32(ctx->Yi.c + 12);
997d1aac 892#endif
0f113f3e
MC
893 else
894 ctr = ctx->Yi.d[3];
895 }
e7f5b1cd 896
0f113f3e
MC
897 (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
898 ++ctr;
899 if (is_endian.little)
997d1aac 900#ifdef BSWAP4
0f113f3e 901 ctx->Yi.d[3] = BSWAP4(ctr);
997d1aac 902#else
0f113f3e 903 PUTU32(ctx->Yi.c + 12, ctr);
997d1aac 904#endif
0f113f3e
MC
905 else
906 ctx->Yi.d[3] = ctr;
e7f5b1cd
AP
907}
908
0f113f3e
MC
909int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
910 size_t len)
e7f5b1cd 911{
0f113f3e
MC
912 size_t i;
913 unsigned int n;
914 u64 alen = ctx->len.u[0];
d8d95832 915#ifdef GCM_FUNCREF_4BIT
0f113f3e 916 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
d8d95832 917# ifdef GHASH
0f113f3e
MC
918 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
919 const u8 *inp, size_t len) = ctx->ghash;
d8d95832
AP
920# endif
921#endif
e7f5b1cd 922
0f113f3e
MC
923 if (ctx->len.u[1])
924 return -2;
925
926 alen += len;
927 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
928 return -1;
929 ctx->len.u[0] = alen;
930
931 n = ctx->ares;
932 if (n) {
933 while (n && len) {
934 ctx->Xi.c[n] ^= *(aad++);
935 --len;
936 n = (n + 1) % 16;
937 }
938 if (n == 0)
939 GCM_MUL(ctx, Xi);
940 else {
941 ctx->ares = n;
942 return 0;
943 }
944 }
2262beef 945#ifdef GHASH
0f113f3e
MC
946 if ((i = (len & (size_t)-16))) {
947 GHASH(ctx, aad, i);
948 aad += i;
949 len -= i;
950 }
2262beef 951#else
0f113f3e
MC
952 while (len >= 16) {
953 for (i = 0; i < 16; ++i)
954 ctx->Xi.c[i] ^= aad[i];
955 GCM_MUL(ctx, Xi);
956 aad += 16;
957 len -= 16;
958 }
2262beef 959#endif
0f113f3e
MC
960 if (len) {
961 n = (unsigned int)len;
962 for (i = 0; i < len; ++i)
963 ctx->Xi.c[i] ^= aad[i];
964 }
b68c1315 965
0f113f3e
MC
966 ctx->ares = n;
967 return 0;
e7f5b1cd
AP
968}
969
/*
 * CRYPTO_gcm128_encrypt: encrypt |len| bytes from |in| into |out| by XORing
 * them with keystream blocks produced by ctx->block over the counter block
 * ctx->Yi, and fold the produced ciphertext into the hash accumulator
 * ctx->Xi.
 *
 * Returns -1 when the accumulated message length would exceed 2^36 - 32
 * bytes or wraps around, 0 otherwise.  ctx->mres holds the offset inside
 * the current keystream block (ctx->EKi) so a later call can continue a
 * partially consumed block.
 */
1f2502eb 970int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
0f113f3e
MC
971 const unsigned char *in, unsigned char *out,
972 size_t len)
e7f5b1cd 973{
0f113f3e
MC
974 const union {
975 long one;
976 char little;
2e635aa8 977 } is_endian = { 1 };
0f113f3e
MC
978 unsigned int n, ctr;
979 size_t i;
980 u64 mlen = ctx->len.u[1];
981 block128_f block = ctx->block;
982 void *key = ctx->key;
d8d95832 983#ifdef GCM_FUNCREF_4BIT
0f113f3e 984 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
2e635aa8 985# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
0f113f3e
MC
986 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
987 const u8 *inp, size_t len) = ctx->ghash;
d8d95832
AP
988# endif
989#endif
1f2502eb 990
0f113f3e
MC
/* Enforce the total message length limit before touching any state. */
991 mlen += len;
992 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
993 return -1;
994 ctx->len.u[1] = mlen;
e7f5b1cd 995
0f113f3e
MC
996 if (ctx->ares) {
997 /* First call to encrypt finalizes GHASH(AAD) */
998 GCM_MUL(ctx, Xi);
999 ctx->ares = 0;
1000 }
96a4cf8c 1001
/* Load the 32-bit counter from the last word of Yi into host order. */
0f113f3e 1002 if (is_endian.little)
997d1aac 1003#ifdef BSWAP4
0f113f3e 1004 ctr = BSWAP4(ctx->Yi.d[3]);
997d1aac 1005#else
0f113f3e 1006 ctr = GETU32(ctx->Yi.c + 12);
997d1aac 1007#endif
0f113f3e
MC
1008 else
1009 ctr = ctx->Yi.d[3];
96a4cf8c 1010
0f113f3e
MC
1011 n = ctx->mres;
1012#if !defined(OPENSSL_SMALL_FOOTPRINT)
1013 if (16 % sizeof(size_t) == 0) { /* always true actually */
/*
 * The do { ... } while (0) wrapper exists so the STRICT_ALIGNMENT check
 * below can "break" out to the byte-at-a-time loop at the end of the
 * function.
 */
1014 do {
/* Use up keystream left over in EKi from a previous call. */
1015 if (n) {
1016 while (n && len) {
1017 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1018 --len;
1019 n = (n + 1) % 16;
1020 }
1021 if (n == 0)
1022 GCM_MUL(ctx, Xi);
1023 else {
1024 ctx->mres = n;
1025 return 0;
1026 }
1027 }
1028# if defined(STRICT_ALIGNMENT)
1029 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1030 break;
1031# endif
2e635aa8
AP
1032# if defined(GHASH)
1033# if defined(GHASH_CHUNK)
0f113f3e
MC
/* Encrypt GHASH_CHUNK bytes word-wise, then hash them in one call. */
1034 while (len >= GHASH_CHUNK) {
1035 size_t j = GHASH_CHUNK;
1036
1037 while (j) {
1038 size_t *out_t = (size_t *)out;
1039 const size_t *in_t = (const size_t *)in;
1040
1041 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1042 ++ctr;
1043 if (is_endian.little)
2e635aa8 1044# ifdef BSWAP4
0f113f3e 1045 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1046# else
0f113f3e 1047 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1048# endif
0f113f3e
MC
1049 else
1050 ctx->Yi.d[3] = ctr;
1051 for (i = 0; i < 16 / sizeof(size_t); ++i)
1052 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1053 out += 16;
1054 in += 16;
1055 j -= 16;
1056 }
/* out has advanced past the chunk; step back to hash the ciphertext. */
1057 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1058 len -= GHASH_CHUNK;
1059 }
2e635aa8 1060# endif
0f113f3e
MC
/* Remaining whole blocks: i (saved in j) is len rounded down to 16. */
1061 if ((i = (len & (size_t)-16))) {
1062 size_t j = i;
1063
1064 while (len >= 16) {
1065 size_t *out_t = (size_t *)out;
1066 const size_t *in_t = (const size_t *)in;
1067
1068 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1069 ++ctr;
1070 if (is_endian.little)
1071# ifdef BSWAP4
1072 ctx->Yi.d[3] = BSWAP4(ctr);
1073# else
1074 PUTU32(ctx->Yi.c + 12, ctr);
1075# endif
1076 else
1077 ctx->Yi.d[3] = ctr;
1078 for (i = 0; i < 16 / sizeof(size_t); ++i)
1079 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1080 out += 16;
1081 in += 16;
1082 len -= 16;
1083 }
1084 GHASH(ctx, out - j, j);
1085 }
1086# else
/* No GHASH: XOR each ciphertext word into Xi and multiply per block. */
1087 while (len >= 16) {
1088 size_t *out_t = (size_t *)out;
1089 const size_t *in_t = (const size_t *)in;
1090
1091 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1092 ++ctr;
1093 if (is_endian.little)
1094# ifdef BSWAP4
1095 ctx->Yi.d[3] = BSWAP4(ctr);
1096# else
1097 PUTU32(ctx->Yi.c + 12, ctr);
1098# endif
1099 else
1100 ctx->Yi.d[3] = ctr;
1101 for (i = 0; i < 16 / sizeof(size_t); ++i)
1102 ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1103 GCM_MUL(ctx, Xi);
1104 out += 16;
1105 in += 16;
1106 len -= 16;
1107 }
1108# endif
/*
 * Trailing partial block.  n is 0 on arrival here, so it ends up as the
 * number of bytes consumed from the fresh keystream block; the multiply
 * for these bytes is left to a later call.
 */
1109 if (len) {
1110 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1111 ++ctr;
1112 if (is_endian.little)
1113# ifdef BSWAP4
1114 ctx->Yi.d[3] = BSWAP4(ctr);
1115# else
1116 PUTU32(ctx->Yi.c + 12, ctr);
1117# endif
1118 else
1119 ctx->Yi.d[3] = ctr;
1120 while (len--) {
1121 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1122 ++n;
1123 }
1124 }
1125
1126 ctx->mres = n;
1127 return 0;
1128 } while (0);
1129 }
e7f5b1cd 1130#endif
0f113f3e
MC
/*
 * Byte-at-a-time path: used for OPENSSL_SMALL_FOOTPRINT builds and when
 * the alignment check above breaks out.  A new keystream block is
 * generated whenever n wraps to 0.
 */
1131 for (i = 0; i < len; ++i) {
1132 if (n == 0) {
1133 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1134 ++ctr;
1135 if (is_endian.little)
997d1aac 1136#ifdef BSWAP4
0f113f3e 1137 ctx->Yi.d[3] = BSWAP4(ctr);
997d1aac 1138#else
0f113f3e
MC
1139 PUTU32(ctx->Yi.c + 12, ctr);
1140#endif
1141 else
1142 ctx->Yi.d[3] = ctr;
1143 }
1144 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1145 n = (n + 1) % 16;
1146 if (n == 0)
1147 GCM_MUL(ctx, Xi);
1148 }
1149
1150 ctx->mres = n;
1151 return 0;
e7f5b1cd
AP
1152}
1153
1f2502eb 1154int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
0f113f3e
MC
1155 const unsigned char *in, unsigned char *out,
1156 size_t len)
e7f5b1cd 1157{
0f113f3e
MC
1158 const union {
1159 long one;
1160 char little;
2e635aa8 1161 } is_endian = { 1 };
0f113f3e
MC
1162 unsigned int n, ctr;
1163 size_t i;
1164 u64 mlen = ctx->len.u[1];
1165 block128_f block = ctx->block;
1166 void *key = ctx->key;
d8d95832 1167#ifdef GCM_FUNCREF_4BIT
0f113f3e 1168 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
2e635aa8 1169# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
0f113f3e
MC
1170 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1171 const u8 *inp, size_t len) = ctx->ghash;
d8d95832
AP
1172# endif
1173#endif
1f2502eb 1174
0f113f3e
MC
1175 mlen += len;
1176 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1177 return -1;
1178 ctx->len.u[1] = mlen;
e7f5b1cd 1179
0f113f3e
MC
1180 if (ctx->ares) {
1181 /* First call to decrypt finalizes GHASH(AAD) */
1182 GCM_MUL(ctx, Xi);
1183 ctx->ares = 0;
1184 }
b68c1315 1185
0f113f3e 1186 if (is_endian.little)
997d1aac 1187#ifdef BSWAP4
0f113f3e 1188 ctr = BSWAP4(ctx->Yi.d[3]);
997d1aac 1189#else
0f113f3e 1190 ctr = GETU32(ctx->Yi.c + 12);
997d1aac 1191#endif
0f113f3e
MC
1192 else
1193 ctr = ctx->Yi.d[3];
e7f5b1cd 1194
0f113f3e 1195 n = ctx->mres;
e7f5b1cd 1196#if !defined(OPENSSL_SMALL_FOOTPRINT)
0f113f3e
MC
1197 if (16 % sizeof(size_t) == 0) { /* always true actually */
1198 do {
1199 if (n) {
1200 while (n && len) {
1201 u8 c = *(in++);
1202 *(out++) = c ^ ctx->EKi.c[n];
1203 ctx->Xi.c[n] ^= c;
1204 --len;
1205 n = (n + 1) % 16;
1206 }
1207 if (n == 0)
1208 GCM_MUL(ctx, Xi);
1209 else {
1210 ctx->mres = n;
1211 return 0;
1212 }
1213 }
1214# if defined(STRICT_ALIGNMENT)
1215 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1216 break;
1217# endif
2e635aa8
AP
1218# if defined(GHASH)
1219# if defined(GHASH_CHUNK)
0f113f3e
MC
1220 while (len >= GHASH_CHUNK) {
1221 size_t j = GHASH_CHUNK;
1222
1223 GHASH(ctx, in, GHASH_CHUNK);
1224 while (j) {
1225 size_t *out_t = (size_t *)out;
1226 const size_t *in_t = (const size_t *)in;
1227
1228 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1229 ++ctr;
1230 if (is_endian.little)
2e635aa8 1231# ifdef BSWAP4
0f113f3e 1232 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1233# else
0f113f3e 1234 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1235# endif
0f113f3e
MC
1236 else
1237 ctx->Yi.d[3] = ctr;
1238 for (i = 0; i < 16 / sizeof(size_t); ++i)
1239 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1240 out += 16;
1241 in += 16;
1242 j -= 16;
1243 }
1244 len -= GHASH_CHUNK;
1245 }
2e635aa8 1246# endif
0f113f3e
MC
1247 if ((i = (len & (size_t)-16))) {
1248 GHASH(ctx, in, i);
1249 while (len >= 16) {
1250 size_t *out_t = (size_t *)out;
1251 const size_t *in_t = (const size_t *)in;
1252
1253 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1254 ++ctr;
1255 if (is_endian.little)
1256# ifdef BSWAP4
1257 ctx->Yi.d[3] = BSWAP4(ctr);
1258# else
1259 PUTU32(ctx->Yi.c + 12, ctr);
1260# endif
1261 else
1262 ctx->Yi.d[3] = ctr;
1263 for (i = 0; i < 16 / sizeof(size_t); ++i)
1264 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1265 out += 16;
1266 in += 16;
1267 len -= 16;
1268 }
1269 }
1270# else
1271 while (len >= 16) {
1272 size_t *out_t = (size_t *)out;
1273 const size_t *in_t = (const size_t *)in;
1274
1275 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1276 ++ctr;
1277 if (is_endian.little)
1278# ifdef BSWAP4
1279 ctx->Yi.d[3] = BSWAP4(ctr);
1280# else
1281 PUTU32(ctx->Yi.c + 12, ctr);
1282# endif
1283 else
1284 ctx->Yi.d[3] = ctr;
1285 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1286 size_t c = in[i];
1287 out[i] = c ^ ctx->EKi.t[i];
1288 ctx->Xi.t[i] ^= c;
1289 }
1290 GCM_MUL(ctx, Xi);
1291 out += 16;
1292 in += 16;
1293 len -= 16;
1294 }
1295# endif
1296 if (len) {
1297 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1298 ++ctr;
1299 if (is_endian.little)
1300# ifdef BSWAP4
1301 ctx->Yi.d[3] = BSWAP4(ctr);
1302# else
1303 PUTU32(ctx->Yi.c + 12, ctr);
1304# endif
1305 else
1306 ctx->Yi.d[3] = ctr;
1307 while (len--) {
1308 u8 c = in[n];
1309 ctx->Xi.c[n] ^= c;
1310 out[n] = c ^ ctx->EKi.c[n];
1311 ++n;
1312 }
1313 }
1314
1315 ctx->mres = n;
1316 return 0;
1317 } while (0);
1318 }
997d1aac 1319#endif
0f113f3e
MC
1320 for (i = 0; i < len; ++i) {
1321 u8 c;
1322 if (n == 0) {
1323 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1324 ++ctr;
1325 if (is_endian.little)
997d1aac 1326#ifdef BSWAP4
0f113f3e 1327 ctx->Yi.d[3] = BSWAP4(ctr);
997d1aac 1328#else
0f113f3e
MC
1329 PUTU32(ctx->Yi.c + 12, ctr);
1330#endif
1331 else
1332 ctx->Yi.d[3] = ctr;
1333 }
1334 c = in[i];
1335 out[i] = c ^ ctx->EKi.c[n];
1336 ctx->Xi.c[n] ^= c;
1337 n = (n + 1) % 16;
1338 if (n == 0)
1339 GCM_MUL(ctx, Xi);
1340 }
96a4cf8c 1341
0f113f3e
MC
1342 ctx->mres = n;
1343 return 0;
e7f5b1cd
AP
1344}
1345
/*
 * CRYPTO_gcm128_encrypt_ctr32: same job as CRYPTO_gcm128_encrypt, but whole
 * 16-byte blocks are encrypted through the caller-supplied |stream|
 * function instead of one ctx->block call per block.
 *
 * With OPENSSL_SMALL_FOOTPRINT defined this simply forwards to
 * CRYPTO_gcm128_encrypt and |stream| is unused.  Returns -1 when the
 * accumulated message length would exceed 2^36 - 32 bytes or wraps around,
 * 0 otherwise.
 */
1f2502eb 1346int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
0f113f3e
MC
1347 const unsigned char *in, unsigned char *out,
1348 size_t len, ctr128_f stream)
f71c6ace 1349{
2e635aa8
AP
1350#if defined(OPENSSL_SMALL_FOOTPRINT)
1351 return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1352#else
0f113f3e
MC
1353 const union {
1354 long one;
1355 char little;
2e635aa8 1356 } is_endian = { 1 };
0f113f3e
MC
1357 unsigned int n, ctr;
1358 size_t i;
1359 u64 mlen = ctx->len.u[1];
1360 void *key = ctx->key;
2e635aa8 1361# ifdef GCM_FUNCREF_4BIT
0f113f3e 1362 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
2e635aa8 1363# ifdef GHASH
0f113f3e
MC
1364 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1365 const u8 *inp, size_t len) = ctx->ghash;
2e635aa8 1366# endif
d8d95832 1367# endif
1f2502eb 1368
0f113f3e
MC
/* Enforce the total message length limit before touching any state. */
1369 mlen += len;
1370 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1371 return -1;
1372 ctx->len.u[1] = mlen;
f71c6ace 1373
0f113f3e
MC
1374 if (ctx->ares) {
1375 /* First call to encrypt finalizes GHASH(AAD) */
1376 GCM_MUL(ctx, Xi);
1377 ctx->ares = 0;
1378 }
b68c1315 1379
/* Load the 32-bit counter from the last word of Yi into host order. */
0f113f3e 1380 if (is_endian.little)
2e635aa8 1381# ifdef BSWAP4
0f113f3e 1382 ctr = BSWAP4(ctx->Yi.d[3]);
2e635aa8 1383# else
0f113f3e 1384 ctr = GETU32(ctx->Yi.c + 12);
2e635aa8 1385# endif
0f113f3e
MC
1386 else
1387 ctr = ctx->Yi.d[3];
1388
/* Use up keystream left over in EKi from a previous call. */
1389 n = ctx->mres;
1390 if (n) {
1391 while (n && len) {
1392 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1393 --len;
1394 n = (n + 1) % 16;
1395 }
1396 if (n == 0)
1397 GCM_MUL(ctx, Xi);
1398 else {
1399 ctx->mres = n;
1400 return 0;
1401 }
1402 }
2e635aa8 1403# if defined(GHASH) && defined(GHASH_CHUNK)
0f113f3e
MC
/*
 * Bulk path: |stream| handles GHASH_CHUNK / 16 blocks per call.  The
 * counter is advanced here and written back to Yi after each call.
 * NOTE(review): this presumably means |stream| does not update the
 * counter in Yi itself - confirm against the ctr128_f contract.
 */
1404 while (len >= GHASH_CHUNK) {
1405 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1406 ctr += GHASH_CHUNK / 16;
1407 if (is_endian.little)
2e635aa8 1408# ifdef BSWAP4
0f113f3e 1409 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1410# else
0f113f3e 1411 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1412# endif
0f113f3e
MC
1413 else
1414 ctx->Yi.d[3] = ctr;
/* Hash the ciphertext that was just written to out. */
1415 GHASH(ctx, out, GHASH_CHUNK);
1416 out += GHASH_CHUNK;
1417 in += GHASH_CHUNK;
1418 len -= GHASH_CHUNK;
1419 }
2e635aa8 1420# endif
0f113f3e
MC
/* Remaining whole blocks: i is len rounded down to 16, j the block count. */
1421 if ((i = (len & (size_t)-16))) {
1422 size_t j = i / 16;
f71c6ace 1423
0f113f3e
MC
1424 (*stream) (in, out, j, key, ctx->Yi.c);
1425 ctr += (unsigned int)j;
1426 if (is_endian.little)
2e635aa8 1427# ifdef BSWAP4
0f113f3e 1428 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1429# else
0f113f3e 1430 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1431# endif
0f113f3e
MC
1432 else
1433 ctx->Yi.d[3] = ctr;
/* in and len are advanced first; out is advanced while hashing below. */
1434 in += i;
1435 len -= i;
2e635aa8 1436# if defined(GHASH)
0f113f3e
MC
1437 GHASH(ctx, out, i);
1438 out += i;
2e635aa8 1439# else
0f113f3e
MC
/* i is reused as the byte index here; its previous value is no longer needed. */
1440 while (j--) {
1441 for (i = 0; i < 16; ++i)
1442 ctx->Xi.c[i] ^= out[i];
1443 GCM_MUL(ctx, Xi);
1444 out += 16;
1445 }
2e635aa8 1446# endif
0f113f3e
MC
1447 }
/*
 * Trailing partial block: one keystream block from ctx->block.  n is 0 on
 * arrival here and ends up as the number of bytes consumed; the multiply
 * for them is left to a later call.
 */
1448 if (len) {
1449 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1450 ++ctr;
1451 if (is_endian.little)
2e635aa8 1452# ifdef BSWAP4
0f113f3e 1453 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1454# else
0f113f3e 1455 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1456# endif
0f113f3e
MC
1457 else
1458 ctx->Yi.d[3] = ctr;
1459 while (len--) {
1460 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1461 ++n;
1462 }
1463 }
1464
1465 ctx->mres = n;
1466 return 0;
2e635aa8 1467#endif
f71c6ace
AP
1468}
1469
/*
 * CRYPTO_gcm128_decrypt_ctr32: same job as CRYPTO_gcm128_decrypt, but whole
 * 16-byte blocks are decrypted through the caller-supplied |stream|
 * function instead of one ctx->block call per block.
 *
 * With OPENSSL_SMALL_FOOTPRINT defined this simply forwards to
 * CRYPTO_gcm128_decrypt and |stream| is unused.  Returns -1 when the
 * accumulated message length would exceed 2^36 - 32 bytes or wraps around,
 * 0 otherwise.
 */
1f2502eb 1470int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
0f113f3e
MC
1471 const unsigned char *in, unsigned char *out,
1472 size_t len, ctr128_f stream)
f71c6ace 1473{
2e635aa8
AP
1474#if defined(OPENSSL_SMALL_FOOTPRINT)
1475 return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1476#else
0f113f3e
MC
1477 const union {
1478 long one;
1479 char little;
2e635aa8 1480 } is_endian = { 1 };
0f113f3e
MC
1481 unsigned int n, ctr;
1482 size_t i;
1483 u64 mlen = ctx->len.u[1];
1484 void *key = ctx->key;
2e635aa8 1485# ifdef GCM_FUNCREF_4BIT
0f113f3e 1486 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
2e635aa8 1487# ifdef GHASH
0f113f3e
MC
1488 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1489 const u8 *inp, size_t len) = ctx->ghash;
2e635aa8 1490# endif
d8d95832 1491# endif
1f2502eb 1492
0f113f3e
MC
/* Enforce the total message length limit before touching any state. */
1493 mlen += len;
1494 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1495 return -1;
1496 ctx->len.u[1] = mlen;
f71c6ace 1497
0f113f3e
MC
1498 if (ctx->ares) {
1499 /* First call to decrypt finalizes GHASH(AAD) */
1500 GCM_MUL(ctx, Xi);
1501 ctx->ares = 0;
1502 }
b68c1315 1503
/* Load the 32-bit counter from the last word of Yi into host order. */
0f113f3e 1504 if (is_endian.little)
2e635aa8 1505# ifdef BSWAP4
0f113f3e 1506 ctr = BSWAP4(ctx->Yi.d[3]);
2e635aa8 1507# else
0f113f3e 1508 ctr = GETU32(ctx->Yi.c + 12);
2e635aa8 1509# endif
0f113f3e
MC
1510 else
1511 ctr = ctx->Yi.d[3];
1512
/* Use up keystream left over in EKi from a previous call. */
1513 n = ctx->mres;
1514 if (n) {
1515 while (n && len) {
1516 u8 c = *(in++);
1517 *(out++) = c ^ ctx->EKi.c[n];
1518 ctx->Xi.c[n] ^= c;
1519 --len;
1520 n = (n + 1) % 16;
1521 }
1522 if (n == 0)
1523 GCM_MUL(ctx, Xi);
1524 else {
1525 ctx->mres = n;
1526 return 0;
1527 }
1528 }
2e635aa8 1529# if defined(GHASH) && defined(GHASH_CHUNK)
0f113f3e
MC
/*
 * Bulk path.  The input is hashed BEFORE |stream| is called.
 * NOTE(review): presumably this ordering keeps the hash over ciphertext
 * when the caller decrypts in place (in == out) - confirm.
 */
1530 while (len >= GHASH_CHUNK) {
1531 GHASH(ctx, in, GHASH_CHUNK);
1532 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1533 ctr += GHASH_CHUNK / 16;
1534 if (is_endian.little)
2e635aa8 1535# ifdef BSWAP4
0f113f3e 1536 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1537# else
0f113f3e 1538 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1539# endif
0f113f3e
MC
1540 else
1541 ctx->Yi.d[3] = ctr;
1542 out += GHASH_CHUNK;
1543 in += GHASH_CHUNK;
1544 len -= GHASH_CHUNK;
1545 }
2e635aa8 1546# endif
0f113f3e
MC
/* Remaining whole blocks: i is len rounded down to 16, j the block count. */
1547 if ((i = (len & (size_t)-16))) {
1548 size_t j = i / 16;
f71c6ace 1549
2e635aa8 1550# if defined(GHASH)
0f113f3e 1551 GHASH(ctx, in, i);
2e635aa8 1552# else
0f113f3e
MC
/*
 * Manual hashing walks in forward and consumes j, so both are restored
 * afterwards for the stream call below.
 */
1553 while (j--) {
1554 size_t k;
1555 for (k = 0; k < 16; ++k)
1556 ctx->Xi.c[k] ^= in[k];
1557 GCM_MUL(ctx, Xi);
1558 in += 16;
1559 }
1560 j = i / 16;
1561 in -= i;
2e635aa8 1562# endif
0f113f3e
MC
1563 (*stream) (in, out, j, key, ctx->Yi.c);
1564 ctr += (unsigned int)j;
1565 if (is_endian.little)
2e635aa8 1566# ifdef BSWAP4
0f113f3e 1567 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1568# else
0f113f3e 1569 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1570# endif
0f113f3e
MC
1571 else
1572 ctx->Yi.d[3] = ctr;
1573 out += i;
1574 in += i;
1575 len -= i;
1576 }
/*
 * Trailing partial block: one keystream block from ctx->block.  n is 0 on
 * arrival here and ends up as the number of bytes consumed; the multiply
 * for them is left to a later call.
 */
1577 if (len) {
1578 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1579 ++ctr;
1580 if (is_endian.little)
2e635aa8 1581# ifdef BSWAP4
0f113f3e 1582 ctx->Yi.d[3] = BSWAP4(ctr);
2e635aa8 1583# else
0f113f3e 1584 PUTU32(ctx->Yi.c + 12, ctr);
2e635aa8 1585# endif
0f113f3e
MC
1586 else
1587 ctx->Yi.d[3] = ctr;
1588 while (len--) {
1589 u8 c = in[n];
1590 ctx->Xi.c[n] ^= c;
1591 out[n] = c ^ ctx->EKi.c[n];
1592 ++n;
1593 }
1594 }
1595
1596 ctx->mres = n;
1597 return 0;
2e635aa8 1598#endif
f71c6ace
AP
1599}
1600
0f113f3e
MC
1601int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1602 size_t len)
e7f5b1cd 1603{
0f113f3e
MC
1604 const union {
1605 long one;
1606 char little;
2e635aa8 1607 } is_endian = { 1 };
0f113f3e
MC
1608 u64 alen = ctx->len.u[0] << 3;
1609 u64 clen = ctx->len.u[1] << 3;
d8d95832 1610#ifdef GCM_FUNCREF_4BIT
0f113f3e 1611 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
d8d95832 1612#endif
e7f5b1cd 1613
0f113f3e
MC
1614 if (ctx->mres || ctx->ares)
1615 GCM_MUL(ctx, Xi);
e7f5b1cd 1616
0f113f3e 1617 if (is_endian.little) {
e7f5b1cd 1618#ifdef BSWAP8
0f113f3e
MC
1619 alen = BSWAP8(alen);
1620 clen = BSWAP8(clen);
e7f5b1cd 1621#else
0f113f3e 1622 u8 *p = ctx->len.c;
e7f5b1cd 1623
0f113f3e
MC
1624 ctx->len.u[0] = alen;
1625 ctx->len.u[1] = clen;
e7f5b1cd 1626
0f113f3e
MC
1627 alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1628 clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
e7f5b1cd 1629#endif
0f113f3e 1630 }
e7f5b1cd 1631
0f113f3e
MC
1632 ctx->Xi.u[0] ^= alen;
1633 ctx->Xi.u[1] ^= clen;
1634 GCM_MUL(ctx, Xi);
e7f5b1cd 1635
0f113f3e
MC
1636 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1637 ctx->Xi.u[1] ^= ctx->EK0.u[1];
6acb4ff3 1638
0f113f3e 1639 if (tag && len <= sizeof(ctx->Xi))
1e4a355d 1640 return CRYPTO_memcmp(ctx->Xi.c, tag, len);
0f113f3e
MC
1641 else
1642 return -1;
6acb4ff3
AP
1643}
1644
fd3dbc1d
DSH
1645void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1646{
0f113f3e
MC
1647 CRYPTO_gcm128_finish(ctx, NULL, 0);
1648 memcpy(tag, ctx->Xi.c,
1649 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
fd3dbc1d
DSH
1650}
1651
6acb4ff3
AP
1652GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1653{
0f113f3e 1654 GCM128_CONTEXT *ret;
6acb4ff3 1655
90945fa3 1656 if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
0f113f3e 1657 CRYPTO_gcm128_init(ret, key, block);
6acb4ff3 1658
0f113f3e 1659 return ret;
6acb4ff3
AP
1660}
1661
1662void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1663{
4b45c6e5 1664 OPENSSL_clear_free(ctx, sizeof(*ctx));
e7f5b1cd 1665}