]>
Commit | Line | Data |
---|---|---|
598848e4 ILT |
1 | /* sha1.c - Functions to compute SHA1 message digest of files or |
2 | memory blocks according to the NIST specification FIPS-180-1. | |
3 | ||
a945c346 | 4 | Copyright (C) 2000-2024 Free Software Foundation, Inc. |
598848e4 ILT |
5 | |
6 | This program is free software; you can redistribute it and/or modify it | |
7 | under the terms of the GNU General Public License as published by the | |
8 | Free Software Foundation; either version 2, or (at your option) any | |
9 | later version. | |
10 | ||
11 | This program is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with this program; if not, write to the Free Software Foundation, | |
18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | |
19 | ||
20 | /* Written by Scott G. Miller | |
21 | Credits: | |
22 | Robert Klep <robert@ilse.nl> -- Expansion function fix | |
23 | */ | |
24 | ||
25 | #include <config.h> | |
26 | ||
27 | #include "sha1.h" | |
28 | ||
29 | #include <stddef.h> | |
30 | #include <string.h> | |
31 | ||
bf4f40cc JJ |
32 | #ifdef HAVE_X86_SHA1_HW_SUPPORT |
33 | # include <x86intrin.h> | |
34 | # include <cpuid.h> | |
35 | #endif | |
36 | ||
598848e4 ILT |
37 | #if USE_UNLOCKED_IO |
38 | # include "unlocked-io.h" | |
39 | #endif | |
40 | ||
/* SWAP (n) yields the 32-bit value N with its four bytes reversed when
   the build is not marked WORDS_BIGENDIAN, and N unchanged otherwise.
   N is evaluated more than once, so it must not have side effects.  */
#ifndef WORDS_BIGENDIAN
# define SWAP(n)							\
    (((n) >> 24) | (((n) >> 8) & 0xff00) | (((n) & 0xff00) << 8) | ((n) << 24))
#else
# define SWAP(n) (n)
#endif

/* Number of bytes sha1_stream reads per iteration.  It has to be a
   multiple of the 64-byte block size.  */
#define BLOCKSIZE 4096
#if (BLOCKSIZE % 64) != 0
# error "invalid BLOCKSIZE"
#endif

/* Padding appended by sha1_finish_ctx: a single 0x80 byte followed by
   zero bytes (the remaining elements are zero-initialized).  */
static const unsigned char fillbuf[64] = { 0x80 };
56 | ||
57 | ||
58 | /* Take a pointer to a 160 bit block of data (five 32 bit ints) and | |
59 | initialize it to the start constants of the SHA1 algorithm. This | |
60 | must be called before using hash in the call to sha1_hash. */ | |
61 | void | |
62 | sha1_init_ctx (struct sha1_ctx *ctx) | |
63 | { | |
64 | ctx->A = 0x67452301; | |
65 | ctx->B = 0xefcdab89; | |
66 | ctx->C = 0x98badcfe; | |
67 | ctx->D = 0x10325476; | |
68 | ctx->E = 0xc3d2e1f0; | |
69 | ||
70 | ctx->total[0] = ctx->total[1] = 0; | |
71 | ctx->buflen = 0; | |
72 | } | |
73 | ||
74 | /* Put result from CTX in first 20 bytes following RESBUF. The result | |
75 | must be in little endian byte order. | |
76 | ||
77 | IMPORTANT: On some systems it is required that RESBUF is correctly | |
78 | aligned for a 32-bit value. */ | |
79 | void * | |
80 | sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf) | |
81 | { | |
82 | ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A); | |
83 | ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B); | |
84 | ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C); | |
85 | ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D); | |
86 | ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E); | |
87 | ||
88 | return resbuf; | |
89 | } | |
90 | ||
91 | /* Process the remaining bytes in the internal buffer and the usual | |
92 | prolog according to the standard and write the result to RESBUF. | |
93 | ||
94 | IMPORTANT: On some systems it is required that RESBUF is correctly | |
95 | aligned for a 32-bit value. */ | |
96 | void * | |
97 | sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf) | |
98 | { | |
99 | /* Take yet unprocessed bytes into account. */ | |
100 | sha1_uint32 bytes = ctx->buflen; | |
101 | size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4; | |
102 | ||
103 | /* Now count remaining bytes. */ | |
104 | ctx->total[0] += bytes; | |
105 | if (ctx->total[0] < bytes) | |
106 | ++ctx->total[1]; | |
107 | ||
108 | /* Put the 64-bit file length in *bits* at the end of the buffer. */ | |
109 | ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29)); | |
110 | ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3); | |
111 | ||
112 | memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes); | |
113 | ||
114 | /* Process last bytes. */ | |
115 | sha1_process_block (ctx->buffer, size * 4, ctx); | |
116 | ||
117 | return sha1_read_ctx (ctx, resbuf); | |
118 | } | |
119 | ||
120 | /* Compute SHA1 message digest for bytes read from STREAM. The | |
121 | resulting message digest number will be written into the 16 bytes | |
122 | beginning at RESBLOCK. */ | |
123 | int | |
124 | sha1_stream (FILE *stream, void *resblock) | |
125 | { | |
126 | struct sha1_ctx ctx; | |
127 | char buffer[BLOCKSIZE + 72]; | |
128 | size_t sum; | |
129 | ||
130 | /* Initialize the computation context. */ | |
131 | sha1_init_ctx (&ctx); | |
132 | ||
133 | /* Iterate over full file contents. */ | |
134 | while (1) | |
135 | { | |
136 | /* We read the file in blocks of BLOCKSIZE bytes. One call of the | |
137 | computation function processes the whole buffer so that with the | |
138 | next round of the loop another block can be read. */ | |
139 | size_t n; | |
140 | sum = 0; | |
141 | ||
142 | /* Read block. Take care for partial reads. */ | |
143 | while (1) | |
144 | { | |
145 | n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); | |
146 | ||
147 | sum += n; | |
148 | ||
149 | if (sum == BLOCKSIZE) | |
150 | break; | |
151 | ||
152 | if (n == 0) | |
153 | { | |
154 | /* Check for the error flag IFF N == 0, so that we don't | |
155 | exit the loop after a partial read due to e.g., EAGAIN | |
156 | or EWOULDBLOCK. */ | |
157 | if (ferror (stream)) | |
158 | return 1; | |
159 | goto process_partial_block; | |
160 | } | |
161 | ||
162 | /* We've read at least one byte, so ignore errors. But always | |
163 | check for EOF, since feof may be true even though N > 0. | |
164 | Otherwise, we could end up calling fread after EOF. */ | |
165 | if (feof (stream)) | |
166 | goto process_partial_block; | |
167 | } | |
168 | ||
169 | /* Process buffer with BLOCKSIZE bytes. Note that | |
170 | BLOCKSIZE % 64 == 0 | |
171 | */ | |
172 | sha1_process_block (buffer, BLOCKSIZE, &ctx); | |
173 | } | |
174 | ||
175 | process_partial_block:; | |
176 | ||
177 | /* Process any remaining bytes. */ | |
178 | if (sum > 0) | |
179 | sha1_process_bytes (buffer, sum, &ctx); | |
180 | ||
181 | /* Construct result in desired memory. */ | |
182 | sha1_finish_ctx (&ctx, resblock); | |
183 | return 0; | |
184 | } | |
185 | ||
186 | /* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The | |
187 | result is always in little endian byte order, so that a byte-wise | |
188 | output yields to the wanted ASCII representation of the message | |
189 | digest. */ | |
190 | void * | |
191 | sha1_buffer (const char *buffer, size_t len, void *resblock) | |
192 | { | |
193 | struct sha1_ctx ctx; | |
194 | ||
195 | /* Initialize the computation context. */ | |
196 | sha1_init_ctx (&ctx); | |
197 | ||
198 | /* Process whole buffer but last len % 64 bytes. */ | |
199 | sha1_process_bytes (buffer, len, &ctx); | |
200 | ||
201 | /* Put result in desired memory area. */ | |
202 | return sha1_finish_ctx (&ctx, resblock); | |
203 | } | |
204 | ||
205 | void | |
206 | sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) | |
207 | { | |
208 | /* When we already have some bits in our internal buffer concatenate | |
209 | both inputs first. */ | |
210 | if (ctx->buflen != 0) | |
211 | { | |
212 | size_t left_over = ctx->buflen; | |
213 | size_t add = 128 - left_over > len ? len : 128 - left_over; | |
214 | ||
215 | memcpy (&((char *) ctx->buffer)[left_over], buffer, add); | |
216 | ctx->buflen += add; | |
217 | ||
218 | if (ctx->buflen > 64) | |
219 | { | |
220 | sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx); | |
221 | ||
222 | ctx->buflen &= 63; | |
223 | /* The regions in the following copy operation cannot overlap. */ | |
224 | memcpy (ctx->buffer, | |
225 | &((char *) ctx->buffer)[(left_over + add) & ~63], | |
226 | ctx->buflen); | |
227 | } | |
228 | ||
229 | buffer = (const char *) buffer + add; | |
230 | len -= add; | |
231 | } | |
232 | ||
233 | /* Process available complete blocks. */ | |
234 | if (len >= 64) | |
235 | { | |
236 | #if !_STRING_ARCH_unaligned | |
237 | # define alignof(type) offsetof (struct { char c; type x; }, x) | |
238 | # define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0) | |
239 | if (UNALIGNED_P (buffer)) | |
240 | while (len > 64) | |
241 | { | |
242 | sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); | |
243 | buffer = (const char *) buffer + 64; | |
244 | len -= 64; | |
245 | } | |
246 | else | |
247 | #endif | |
248 | { | |
249 | sha1_process_block (buffer, len & ~63, ctx); | |
250 | buffer = (const char *) buffer + (len & ~63); | |
251 | len &= 63; | |
252 | } | |
253 | } | |
254 | ||
255 | /* Move remaining bytes in internal buffer. */ | |
256 | if (len > 0) | |
257 | { | |
258 | size_t left_over = ctx->buflen; | |
259 | ||
260 | memcpy (&((char *) ctx->buffer)[left_over], buffer, len); | |
261 | left_over += len; | |
262 | if (left_over >= 64) | |
263 | { | |
264 | sha1_process_block (ctx->buffer, 64, ctx); | |
265 | left_over -= 64; | |
f6e9c1c9 | 266 | memmove (ctx->buffer, &ctx->buffer[16], left_over); |
598848e4 ILT |
267 | } |
268 | ctx->buflen = left_over; | |
269 | } | |
270 | } | |
271 | ||
272 | /* --- Code below is the primary difference between md5.c and sha1.c --- */ | |
273 | ||
274 | /* SHA1 round constants */ | |
275 | #define K1 0x5a827999 | |
276 | #define K2 0x6ed9eba1 | |
277 | #define K3 0x8f1bbcdc | |
278 | #define K4 0xca62c1d6 | |
279 | ||
280 | /* Round functions. Note that F2 is the same as F4. */ | |
281 | #define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) ) | |
282 | #define F2(B,C,D) (B ^ C ^ D) | |
283 | #define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) ) | |
284 | #define F4(B,C,D) (B ^ C ^ D) | |
285 | ||
286 | /* Process LEN bytes of BUFFER, accumulating context into CTX. | |
287 | It is assumed that LEN % 64 == 0. | |
288 | Most of this code comes from GnuPG's cipher/sha1.c. */ | |
289 | ||
290 | void | |
291 | sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx) | |
292 | { | |
293 | const sha1_uint32 *words = (const sha1_uint32*) buffer; | |
294 | size_t nwords = len / sizeof (sha1_uint32); | |
295 | const sha1_uint32 *endp = words + nwords; | |
296 | sha1_uint32 x[16]; | |
297 | sha1_uint32 a = ctx->A; | |
298 | sha1_uint32 b = ctx->B; | |
299 | sha1_uint32 c = ctx->C; | |
300 | sha1_uint32 d = ctx->D; | |
301 | sha1_uint32 e = ctx->E; | |
302 | ||
303 | /* First increment the byte count. RFC 1321 specifies the possible | |
304 | length of the file up to 2^64 bits. Here we only compute the | |
305 | number of bytes. Do a double word increment. */ | |
306 | ctx->total[0] += len; | |
1d77deec | 307 | ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len); |
598848e4 ILT |
308 | |
309 | #define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n)))) | |
310 | ||
311 | #define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \ | |
312 | ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \ | |
313 | , (x[I&0x0f] = rol(tm, 1)) ) | |
314 | ||
315 | #define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ | |
316 | + F( B, C, D ) \ | |
317 | + K \ | |
318 | + M; \ | |
319 | B = rol( B, 30 ); \ | |
320 | } while(0) | |
321 | ||
322 | while (words < endp) | |
323 | { | |
324 | sha1_uint32 tm; | |
325 | int t; | |
326 | for (t = 0; t < 16; t++) | |
327 | { | |
328 | x[t] = SWAP (*words); | |
329 | words++; | |
330 | } | |
331 | ||
332 | R( a, b, c, d, e, F1, K1, x[ 0] ); | |
333 | R( e, a, b, c, d, F1, K1, x[ 1] ); | |
334 | R( d, e, a, b, c, F1, K1, x[ 2] ); | |
335 | R( c, d, e, a, b, F1, K1, x[ 3] ); | |
336 | R( b, c, d, e, a, F1, K1, x[ 4] ); | |
337 | R( a, b, c, d, e, F1, K1, x[ 5] ); | |
338 | R( e, a, b, c, d, F1, K1, x[ 6] ); | |
339 | R( d, e, a, b, c, F1, K1, x[ 7] ); | |
340 | R( c, d, e, a, b, F1, K1, x[ 8] ); | |
341 | R( b, c, d, e, a, F1, K1, x[ 9] ); | |
342 | R( a, b, c, d, e, F1, K1, x[10] ); | |
343 | R( e, a, b, c, d, F1, K1, x[11] ); | |
344 | R( d, e, a, b, c, F1, K1, x[12] ); | |
345 | R( c, d, e, a, b, F1, K1, x[13] ); | |
346 | R( b, c, d, e, a, F1, K1, x[14] ); | |
347 | R( a, b, c, d, e, F1, K1, x[15] ); | |
348 | R( e, a, b, c, d, F1, K1, M(16) ); | |
349 | R( d, e, a, b, c, F1, K1, M(17) ); | |
350 | R( c, d, e, a, b, F1, K1, M(18) ); | |
351 | R( b, c, d, e, a, F1, K1, M(19) ); | |
352 | R( a, b, c, d, e, F2, K2, M(20) ); | |
353 | R( e, a, b, c, d, F2, K2, M(21) ); | |
354 | R( d, e, a, b, c, F2, K2, M(22) ); | |
355 | R( c, d, e, a, b, F2, K2, M(23) ); | |
356 | R( b, c, d, e, a, F2, K2, M(24) ); | |
357 | R( a, b, c, d, e, F2, K2, M(25) ); | |
358 | R( e, a, b, c, d, F2, K2, M(26) ); | |
359 | R( d, e, a, b, c, F2, K2, M(27) ); | |
360 | R( c, d, e, a, b, F2, K2, M(28) ); | |
361 | R( b, c, d, e, a, F2, K2, M(29) ); | |
362 | R( a, b, c, d, e, F2, K2, M(30) ); | |
363 | R( e, a, b, c, d, F2, K2, M(31) ); | |
364 | R( d, e, a, b, c, F2, K2, M(32) ); | |
365 | R( c, d, e, a, b, F2, K2, M(33) ); | |
366 | R( b, c, d, e, a, F2, K2, M(34) ); | |
367 | R( a, b, c, d, e, F2, K2, M(35) ); | |
368 | R( e, a, b, c, d, F2, K2, M(36) ); | |
369 | R( d, e, a, b, c, F2, K2, M(37) ); | |
370 | R( c, d, e, a, b, F2, K2, M(38) ); | |
371 | R( b, c, d, e, a, F2, K2, M(39) ); | |
372 | R( a, b, c, d, e, F3, K3, M(40) ); | |
373 | R( e, a, b, c, d, F3, K3, M(41) ); | |
374 | R( d, e, a, b, c, F3, K3, M(42) ); | |
375 | R( c, d, e, a, b, F3, K3, M(43) ); | |
376 | R( b, c, d, e, a, F3, K3, M(44) ); | |
377 | R( a, b, c, d, e, F3, K3, M(45) ); | |
378 | R( e, a, b, c, d, F3, K3, M(46) ); | |
379 | R( d, e, a, b, c, F3, K3, M(47) ); | |
380 | R( c, d, e, a, b, F3, K3, M(48) ); | |
381 | R( b, c, d, e, a, F3, K3, M(49) ); | |
382 | R( a, b, c, d, e, F3, K3, M(50) ); | |
383 | R( e, a, b, c, d, F3, K3, M(51) ); | |
384 | R( d, e, a, b, c, F3, K3, M(52) ); | |
385 | R( c, d, e, a, b, F3, K3, M(53) ); | |
386 | R( b, c, d, e, a, F3, K3, M(54) ); | |
387 | R( a, b, c, d, e, F3, K3, M(55) ); | |
388 | R( e, a, b, c, d, F3, K3, M(56) ); | |
389 | R( d, e, a, b, c, F3, K3, M(57) ); | |
390 | R( c, d, e, a, b, F3, K3, M(58) ); | |
391 | R( b, c, d, e, a, F3, K3, M(59) ); | |
392 | R( a, b, c, d, e, F4, K4, M(60) ); | |
393 | R( e, a, b, c, d, F4, K4, M(61) ); | |
394 | R( d, e, a, b, c, F4, K4, M(62) ); | |
395 | R( c, d, e, a, b, F4, K4, M(63) ); | |
396 | R( b, c, d, e, a, F4, K4, M(64) ); | |
397 | R( a, b, c, d, e, F4, K4, M(65) ); | |
398 | R( e, a, b, c, d, F4, K4, M(66) ); | |
399 | R( d, e, a, b, c, F4, K4, M(67) ); | |
400 | R( c, d, e, a, b, F4, K4, M(68) ); | |
401 | R( b, c, d, e, a, F4, K4, M(69) ); | |
402 | R( a, b, c, d, e, F4, K4, M(70) ); | |
403 | R( e, a, b, c, d, F4, K4, M(71) ); | |
404 | R( d, e, a, b, c, F4, K4, M(72) ); | |
405 | R( c, d, e, a, b, F4, K4, M(73) ); | |
406 | R( b, c, d, e, a, F4, K4, M(74) ); | |
407 | R( a, b, c, d, e, F4, K4, M(75) ); | |
408 | R( e, a, b, c, d, F4, K4, M(76) ); | |
409 | R( d, e, a, b, c, F4, K4, M(77) ); | |
410 | R( c, d, e, a, b, F4, K4, M(78) ); | |
411 | R( b, c, d, e, a, F4, K4, M(79) ); | |
412 | ||
413 | a = ctx->A += a; | |
414 | b = ctx->B += b; | |
415 | c = ctx->C += c; | |
416 | d = ctx->D += d; | |
417 | e = ctx->E += e; | |
418 | } | |
419 | } | |
bf4f40cc JJ |
420 | |
421 | #if defined(HAVE_X86_SHA1_HW_SUPPORT) | |
422 | /* HW specific version of sha1_process_bytes. */ | |
423 | ||
424 | static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *); | |
425 | ||
426 | static void | |
427 | sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) | |
428 | { | |
429 | /* When we already have some bits in our internal buffer concatenate | |
430 | both inputs first. */ | |
431 | if (ctx->buflen != 0) | |
432 | { | |
433 | size_t left_over = ctx->buflen; | |
434 | size_t add = 128 - left_over > len ? len : 128 - left_over; | |
435 | ||
436 | memcpy (&((char *) ctx->buffer)[left_over], buffer, add); | |
437 | ctx->buflen += add; | |
438 | ||
439 | if (ctx->buflen > 64) | |
440 | { | |
441 | sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx); | |
442 | ||
443 | ctx->buflen &= 63; | |
444 | /* The regions in the following copy operation cannot overlap. */ | |
445 | memcpy (ctx->buffer, | |
446 | &((char *) ctx->buffer)[(left_over + add) & ~63], | |
447 | ctx->buflen); | |
448 | } | |
449 | ||
450 | buffer = (const char *) buffer + add; | |
451 | len -= add; | |
452 | } | |
453 | ||
454 | /* Process available complete blocks. */ | |
455 | if (len >= 64) | |
456 | { | |
457 | #if !_STRING_ARCH_unaligned | |
458 | # define alignof(type) offsetof (struct { char c; type x; }, x) | |
459 | # define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0) | |
460 | if (UNALIGNED_P (buffer)) | |
461 | while (len > 64) | |
462 | { | |
463 | sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); | |
464 | buffer = (const char *) buffer + 64; | |
465 | len -= 64; | |
466 | } | |
467 | else | |
468 | #endif | |
469 | { | |
470 | sha1_hw_process_block (buffer, len & ~63, ctx); | |
471 | buffer = (const char *) buffer + (len & ~63); | |
472 | len &= 63; | |
473 | } | |
474 | } | |
475 | ||
476 | /* Move remaining bytes in internal buffer. */ | |
477 | if (len > 0) | |
478 | { | |
479 | size_t left_over = ctx->buflen; | |
480 | ||
481 | memcpy (&((char *) ctx->buffer)[left_over], buffer, len); | |
482 | left_over += len; | |
483 | if (left_over >= 64) | |
484 | { | |
485 | sha1_hw_process_block (ctx->buffer, 64, ctx); | |
486 | left_over -= 64; | |
487 | memmove (ctx->buffer, &ctx->buffer[16], left_over); | |
488 | } | |
489 | ctx->buflen = left_over; | |
490 | } | |
491 | } | |
492 | ||
/* Hash the LEN bytes at BUFFER into CTX using the x86 SHA instruction
   set extensions, 64 bytes per iteration, and add LEN to CTX's 64-bit
   byte counter.  Input is fetched with unaligned loads.  Callers in
   this file always pass a multiple of 64 for LEN.  */

#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html */
  const __m128i *words = (const __m128i *) buffer;
  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save;
  __m128i e0, e0_save, e1;
  __m128i msg0, msg1, msg2, msg3;
  /* Byte shuffle that reverses all 16 bytes of a vector; applied to
     every vector of input right after it is loaded.  */
  const __m128i shuf_mask
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  /* Compile-time check: the array size is negative (a hard error)
     unless A, B, C and D are laid out back to back in struct sha1_ctx,
     which the 128-bit load/store of &ctx->A below relies on.  */
  char check[((offsetof (struct sha1_ctx, B)
	       == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
	      && (offsetof (struct sha1_ctx, C)
		  == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
	      && (offsetof (struct sha1_ctx, D)
		  == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
	     ? 1 : -1];

  /* Update the byte count first.  It is kept as two 32-bit words:
     add LEN to the low word, then add to the high word both the part
     of LEN above 32 bits and the carry out of the low word.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Reference CHECK so it is not reported as unused.  */
  (void) &check[0];

  /* Load A..D with one 128-bit load and reverse the order of the four
     32-bit lanes; E goes into the top lane of its own vector.  */
  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  /* Each iteration consumes four 16-byte vectors, i.e. one 64-byte
     block, four rounds per _mm_sha1rnds4_epu32.  The last argument of
     that intrinsic selects the round function/constant and advances
     from 0 to 3 every 20 rounds.  */
  for (; words < endp; words += 4)
    {
      /* Remember the state at block entry for the final addition.  */
      abcd_save = abcd;
      e0_save = e0;

      /* Rounds 0..3 */
      msg0 = _mm_loadu_si128 (&words[0]);
      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* Rounds 4..7 */
      msg1 = _mm_loadu_si128 (&words[1]);
      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* Rounds 8..11 */
      msg2 = _mm_loadu_si128 (&words[2]);
      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* Rounds 12..15 */
      msg3 = _mm_loadu_si128 (&words[3]);
      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* Rounds 16..19 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* Rounds 20..23 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* Rounds 24..27 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* Rounds 28..31 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* Rounds 32..35 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* Rounds 36..39 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* Rounds 40..43 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* Rounds 44..47 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* Rounds 48..51 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* Rounds 52..55 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* Rounds 56..59 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* Rounds 60..63 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* Rounds 64..67 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* Rounds 68..71 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* Rounds 72..75 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* Rounds 76..79 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Finalize: combine with the state saved at block entry.  */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);
    }

  /* Undo the lane reversal and write the state back to CTX.  */
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
702 | #endif | |
703 | ||
704 | /* Return sha1_process_bytes or some hardware optimized version thereof | |
705 | depending on current CPU. */ | |
706 | ||
707 | sha1_process_bytes_fn | |
708 | sha1_choose_process_bytes (void) | |
709 | { | |
710 | #ifdef HAVE_X86_SHA1_HW_SUPPORT | |
711 | unsigned int eax, ebx, ecx, edx; | |
712 | if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx) | |
713 | && (ebx & bit_SHA) != 0 | |
714 | && __get_cpuid (1, &eax, &ebx, &ecx, &edx) | |
715 | && (ecx & bit_SSE4_1) != 0) | |
716 | return sha1_hw_process_bytes; | |
717 | #endif | |
718 | return sha1_process_bytes; | |
719 | } |