/*
 * Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <string.h>
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "crypto/modes.h"

#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef GETU32
# define GETU32(p)   BSWAP4(*(const u32 *)(p))
# undef PUTU32
# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)      ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) do { \
    if (sizeof(size_t)==8) { \
        u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
        V.lo = (V.hi<<63)|(V.lo>>1); \
        V.hi = (V.hi>>1 )^T; \
    } \
    else { \
        u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
        V.lo = (V.hi<<63)|(V.lo>>1); \
        V.hi = (V.hi>>1 )^((u64)T<<32); \
    } \
} while(0)
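
/*
 * In effect REDUCE1BIT(V) computes V = V*x in GCM's bit-reflected
 * representation of GF(2^128): shift V right one bit and, if the bit
 * shifted out of V.lo was set, XOR in the reduction constant derived
 * from the field polynomial x^128 + x^7 + x^2 + x + 1 (whose reflected
 * low terms form the 0xe1 byte) at the top of V.hi.
 */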

/*-
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it's
 * possible to attempt to deduce the secret parameter H, and if
 * successful, to tamper with messages [which is nothing but trivial in
 * CTR mode]. In the "Shoup's" case it's not as trivial, but there is no
 * reason to believe that it's resistant to cache-timing attacks either.
 * And the thing about the "8-bit" implementation is that it consumes 16
 * (sixteen) times more memory, 4KB per individual key + 1KB shared. On
 * the pros side, it should be twice as fast as the "4-bit" version. And
 * for gcc-generated x86[_64] code, the "8-bit" version was observed to
 * run ~75% faster, closer to 100% for commercial compilers... Yet the
 * "4-bit" procedure is preferred, because it's believed to provide a
 * better security-performance balance and adequate all-round
 * performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a consequent
 *   malloc() would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * Value of 1 is not appropriate for performance reasons.
 */
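/*
 * For scale (an added note, not from the original authors): the "4-bit"
 * Htable is 16 u128 entries, i.e. 256 bytes per key, while the "8-bit"
 * table is 256 entries, i.e. 4KB per key, plus the shared rem_8bit
 * remainder table mentioned above.
 */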
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}
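
/*
 * A rough way to read the table above: Htable[i] holds i*H in GF(2^128),
 * with the 4-bit index i interpreted in the same bit-reflected order as
 * the hash input. The power-of-two entries are obtained from H by
 * repeated REDUCE1BIT (multiplication by x), and the remaining entries
 * follow by linearity: Htable[a ^ b] = Htable[a] ^ Htable[b].
 */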

# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
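
/*
 * Reading the loop above as math: this is Horner evaluation of Xi*H one
 * nibble at a time. Each step shifts the accumulator Z right by 4 bits,
 * folds the 4 bits that fall off back in through the precomputed
 * rem_4bit table (the reduction modulo the field polynomial), and XORs
 * in the Htable entry selected by the next nibble; two such steps per
 * byte of Xi give the full 128-bit product.
 */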

# if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

# if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
# else
    /*
     * Extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows down the procedure by approximately
     * the same time as it makes each loop spin faster. In other words,
     * single-block performance is approximately the same as for the
     * straightforward "4-bit" implementation, and then it only goes
     * faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
# endif

        if (is_endian.little) {
# ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
# else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
# endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
# endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif

# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate cache
 * thrashing. In other words, the idea is to hash data while it's still
 * in L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
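/* For reference: a 3KB chunk is 192 16-byte blocks hashed per cache-hot pass. */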
# endif

#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
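
/*
 * The 1-bit path above is the textbook bit-serial multiply: for each of
 * the 128 bits of Xi (most significant bit of each host word first), the
 * all-ones/all-zeros mask M conditionally XORs the running multiple V of
 * H into Z, then REDUCE1BIT advances V = V*x. No tables, so minimal
 * footprint, but roughly 128 reduction steps per block.
 */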

# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
     (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef GCM_MUL
# define GCM_MUL(ctx)      (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef GHASH
#  define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}
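
/*
 * Typical call sequence (an illustrative sketch only; aes_key, iv, aad,
 * pt, ct and the pairing with AES are this comment's assumptions, and
 * error checks are omitted):
 *
 *     GCM128_CONTEXT gcm;
 *
 *     CRYPTO_gcm128_init(&gcm, &aes_key, (block128_f)AES_encrypt);
 *     CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *     CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *     CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len);
 *     CRYPTO_gcm128_tag(&gcm, tag, 16);
 */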

void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);

        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}
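
/*
 * Per the GCM spec: a 96-bit IV is used directly, Yi = IV || 0^31 || 1,
 * while any other IV length is first compressed through the hash,
 * Yi = GHASH(IV padded to a block boundary || 64-bit IV bit length),
 * which is what the "borrow ctx->Xi" branch above computes.
 */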

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
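
/*
 * The bounds above follow NIST SP 800-38D: AAD may not exceed 2^64 - 1
 * bits (hence the 2^61-byte check), and returning -2 once len.u[1] is
 * non-zero enforces that all AAD is supplied before any message data.
 */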

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}
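
/*
 * A note on the GHASH path above (added commentary): ciphertext is
 * staged in ctx->Xn and hashed in batches rather than one GCM_MUL per
 * block, so asm GHASH implementations amortize their per-call overhead;
 * mres counts the staged bytes still awaiting hashing. The non-GHASH
 * path keeps the classic one-multiplication-per-block bookkeeping in
 * ctx->Xi instead.
 */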

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
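
/*
 * Sketch of the contract assumed above: a ctr128_f stream processes
 * whole 16-byte blocks at a time (e.g. a hardware-assisted CTR routine),
 * while this wrapper maintains only the low 32 bits of the counter in
 * the last word of Yi, matching GCM's 32-bit block counter.
 */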

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif

    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}
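
/*
 * In formula form: S = GHASH(A || C || len64(A) || len64(C)) and the
 * tag is EK0 ^ S, where EK0 = E(K, Y0) was cached by setiv. The
 * comparison uses CRYPTO_memcmp, which is constant-time, so tag
 * verification does not leak where the mismatch occurs.
 */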

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}