/*
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (C) 2012-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */
/*!
 * @mainpage xxHash
 *
 * @file xxhash.h
 * xxHash prototypes and implementation
 */
/* TODO: update */
/* Notice extracted from xxHash homepage:

xxHash is an extremely fast hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.

Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10

Q.Score is a measure of quality of the hash function.
It depends on successfully passing SMHasher test set.
10 is a perfect score.

Note: SMHasher's CRC32 implementation is not the fastest one.
Other speed-oriented implementations can be faster,
especially in combination with PCLMUL instruction:
https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735

A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
*/

/* util-linux customizations */
#define XXH_NO_XXH3
#define XXH_NAMESPACE ul_

#if defined (__cplusplus)
extern "C" {
#endif

/* ****************************
 *  INLINE mode
 ******************************/
/*!
 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
 * Use these build macros to inline xxhash into the target unit.
 * Inlining improves performance on small inputs, especially when the length is
 * expressed as a compile-time constant:
 *
 *   https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
 *
 * It also keeps xxHash symbols private to the unit, so they are not exported.
 *
 * Usage:
 *     #define XXH_INLINE_ALL
 *     #include "xxhash.h"
 *
 * Do not compile and link xxhash.o as a separate object, as it is not useful.
 */
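/*!
 * Example: a minimal sketch of the inlined mode described above. The macro
 * must be defined before the first inclusion of this header; the helper
 * function is hypothetical:
 * @code{.c}
 *    #define XXH_INLINE_ALL
 *    #include "xxhash.h"
 *
 *    // All xxHash functions are now private to this translation unit.
 *    static unsigned hash8Bytes(const void* key)
 *    {
 *        // A compile-time-constant length lets the compiler specialize.
 *        return (unsigned)XXH32(key, 8, 0);
 *    }
 * @endcode
 */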
#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
   && !defined(XXH_INLINE_ALL_31684351384)
   /* this section should be traversed only once */
#  define XXH_INLINE_ALL_31684351384
   /* give access to the advanced API, required to compile implementations */
#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
#  define XXH_STATIC_LINKING_ONLY
   /* make all functions private */
#  undef XXH_PUBLIC_API
#  if defined(__GNUC__)
#    define XXH_PUBLIC_API static __inline __attribute__((unused))
#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#    define XXH_PUBLIC_API static inline
#  elif defined(_MSC_VER)
#    define XXH_PUBLIC_API static __inline
#  else
     /* note: this version may generate warnings for unused static functions */
#    define XXH_PUBLIC_API static
#  endif

   /*
    * This part deals with the special case where a unit wants to inline xxHash,
    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
    * such as part of some previously included *.h header file.
    * Without further action, the new include would just be ignored,
    * and functions would effectively _not_ be inlined (silent failure).
    * The following macros solve this situation by prefixing all inlined names,
    * avoiding naming collision with previous inclusions.
    */
   /* Before that, we unconditionally #undef all symbols,
    * in case they were already defined with XXH_NAMESPACE.
    * They will then be redefined for XXH_INLINE_ALL
    */
#  undef XXH_versionNumber
   /* XXH32 */
#  undef XXH32
#  undef XXH32_createState
#  undef XXH32_freeState
#  undef XXH32_reset
#  undef XXH32_update
#  undef XXH32_digest
#  undef XXH32_copyState
#  undef XXH32_canonicalFromHash
#  undef XXH32_hashFromCanonical
   /* XXH64 */
#  undef XXH64
#  undef XXH64_createState
#  undef XXH64_freeState
#  undef XXH64_reset
#  undef XXH64_update
#  undef XXH64_digest
#  undef XXH64_copyState
#  undef XXH64_canonicalFromHash
#  undef XXH64_hashFromCanonical
   /* XXH3_64bits */
#  undef XXH3_64bits
#  undef XXH3_64bits_withSecret
#  undef XXH3_64bits_withSeed
#  undef XXH3_64bits_withSecretandSeed
#  undef XXH3_createState
#  undef XXH3_freeState
#  undef XXH3_copyState
#  undef XXH3_64bits_reset
#  undef XXH3_64bits_reset_withSeed
#  undef XXH3_64bits_reset_withSecret
#  undef XXH3_64bits_update
#  undef XXH3_64bits_digest
#  undef XXH3_generateSecret
   /* XXH3_128bits */
#  undef XXH128
#  undef XXH3_128bits
#  undef XXH3_128bits_withSeed
#  undef XXH3_128bits_withSecret
#  undef XXH3_128bits_reset
#  undef XXH3_128bits_reset_withSeed
#  undef XXH3_128bits_reset_withSecret
#  undef XXH3_128bits_reset_withSecretandSeed
#  undef XXH3_128bits_update
#  undef XXH3_128bits_digest
#  undef XXH128_isEqual
#  undef XXH128_cmp
#  undef XXH128_canonicalFromHash
#  undef XXH128_hashFromCanonical
   /* Finally, free the namespace itself */
#  undef XXH_NAMESPACE

   /* employ the namespace for XXH_INLINE_ALL */
#  define XXH_NAMESPACE XXH_INLINE_
   /*
    * Some identifiers (enums, type names) are not symbols,
    * but they must nonetheless be renamed to avoid redeclaration.
    * Alternative solution: do not redeclare them.
    * However, this requires some #ifdefs, and has a more dispersed impact.
    * Meanwhile, renaming can be achieved in a single place.
    */
#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
#  define XXH_OK XXH_IPREF(XXH_OK)
#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
   /* Ensure the header is parsed again, even if it was previously included */
#  undef XXHASH_H_5627135585666179
#  undef XXHASH_H_STATIC_13879238742
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */



/* ****************************************************************
 *  Stable API
 *****************************************************************/
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1


/*!
 * @defgroup public Public API
 * Contains details on the public xxHash functions.
 * @{
 */
/* specific declaration modes for Windows */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
#    ifdef XXH_EXPORT
#      define XXH_PUBLIC_API __declspec(dllexport)
#    elif XXH_IMPORT
#      define XXH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXH_PUBLIC_API   /* do nothing */
#  endif
#endif

#ifdef XXH_DOXYGEN
/*!
 * @brief Emulate a namespace by transparently prefixing all symbols.
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
 * any public symbol from the xxHash library with the value of XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: Regular symbol names will be automatically translated
 * by this header.
 */
#  define XXH_NAMESPACE /* YOUR NAME HERE */
#  undef XXH_NAMESPACE
#endif

#ifdef XXH_NAMESPACE
#  define XXH_CAT(A,B) A##B
#  define XXH_NAME2(A,B) XXH_CAT(A,B)
#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
/* XXH32 */
#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
/* XXH64 */
#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
/* XXH3_64bits */
#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
/* XXH3_128bits */
#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
#endif


/* *************************************
 *  Version
 ***************************************/
#define XXH_VERSION_MAJOR    0
#define XXH_VERSION_MINOR    8
#define XXH_VERSION_RELEASE  1
#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)

/*!
 * @brief Obtains the xxHash version.
 *
 * This is mostly useful when xxHash is compiled as a shared library,
 * since the returned value comes from the library, as opposed to the header file.
 *
 * @return `XXH_VERSION_NUMBER` of the invoked library.
 */
XXH_PUBLIC_API unsigned XXH_versionNumber (void);
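/*!
 * Example: a minimal sketch of a runtime consistency check against the
 * version macros above:
 * @code{.c}
 *    #include <stdio.h>
 *    #include <xxhash.h>
 *
 *    int main(void)
 *    {
 *        if (XXH_versionNumber() != XXH_VERSION_NUMBER)
 *            fprintf(stderr, "header and library versions differ\n");
 *        printf("xxHash version number: %u\n", XXH_versionNumber());
 *        return 0;
 *    }
 * @endcode
 */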

/* ****************************
 *  Common basic types
 ******************************/
#include <stddef.h>   /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;


/*-**********************************************************************
 *  32-bit hash
 ************************************************************************/
#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
/*!
 * @brief An unsigned 32-bit integer.
 *
 * Not necessarily defined to `uint32_t` but functionally equivalent.
 */
typedef uint32_t XXH32_hash_t;

#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   include <stdint.h>
    typedef uint32_t XXH32_hash_t;

#else
#   include <limits.h>
#   if UINT_MAX == 0xFFFFFFFFUL
      typedef unsigned int XXH32_hash_t;
#   else
#     if ULONG_MAX == 0xFFFFFFFFUL
        typedef unsigned long XXH32_hash_t;
#     else
#       error "unsupported platform: need a 32-bit type"
#     endif
#   endif
#endif

/*!
 * @}
 *
 * @defgroup xxh32_family XXH32 family
 * @ingroup public
 * Contains functions used in the classic 32-bit xxHash algorithm.
 *
 * @note
 *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
 *   Note that @ref xxh3_family provides competitive speed
 *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
 *
 * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
 * @see @ref xxh32_impl for implementation details
 * @{
 */

/*!
 * @brief Calculates the 32-bit hash of @p input using xxHash32.
 *
 * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 32-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 32-bit hash value.
 *
 * @see
 *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
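/*!
 * Example: one-shot hashing of a string with XXH32(). A minimal sketch with a
 * hypothetical helper:
 * @code{.c}
 *    #include <string.h>
 *    #include <xxhash.h>
 *
 *    XXH32_hash_t hashString(const char* s)
 *    {
 *        // Hash the whole buffer in a single call, with seed 0.
 *        return XXH32(s, strlen(s), 0);
 *    }
 * @endcode
 */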

/*!
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * The function returns an error code, with 0 meaning OK, and any other value
 * meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 *
 * Example code for incrementally hashing a file:
 * @code{.c}
 *    #include <assert.h>
 *    #include <stdio.h>
 *    #include <xxhash.h>
 *    #define BUFFER_SIZE 256
 *
 *    // Note: XXH64 and XXH3 use the same interface.
 *    XXH32_hash_t
 *    hashFile(FILE* stream)
 *    {
 *        XXH32_state_t* state;
 *        unsigned char buf[BUFFER_SIZE];
 *        size_t amt;
 *        XXH32_hash_t hash;
 *
 *        state = XXH32_createState();       // Create a state
 *        assert(state != NULL);             // Error check here
 *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
 *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
 *            XXH32_update(state, buf, amt); // Hash the file in chunks
 *        }
 *        hash = XXH32_digest(state);        // Finalize the hash
 *        XXH32_freeState(state);            // Clean up
 *        return hash;
 *    }
 * @endcode
 */

/*!
 * @typedef struct XXH32_state_s XXH32_state_t
 * @brief The opaque state struct for the XXH32 streaming API.
 *
 * @see XXH32_state_s for details.
 */
typedef struct XXH32_state_s XXH32_state_t;

/*!
 * @brief Allocates an @ref XXH32_state_t.
 *
 * Must be freed with XXH32_freeState().
 * @return An allocated XXH32_state_t on success, `NULL` on failure.
 */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
/*!
 * @brief Frees an @ref XXH32_state_t.
 *
 * Must be allocated with XXH32_createState().
 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
 * @return XXH_OK.
 */
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
/*!
 * @brief Copies one @ref XXH32_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

/*!
 * @brief Resets an @ref XXH32_state_t to begin a new hash.
 *
 * This function resets and seeds a state. Call it before @ref XXH32_update().
 *
 * @param statePtr The state struct to reset.
 * @param seed The 32-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);

/*!
 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
 *
 * Call this to incrementally consume blocks of data.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);

/*!
 * @brief Returns the calculated hash value from an @ref XXH32_state_t.
 *
 * @note
 *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return The calculated xxHash32 value from that state.
 */
XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);

/*******   Canonical representation   *******/

/*
 * The default return values from XXH functions are unsigned 32 and 64 bit
 * integers.
 * This is the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of what is the order on the byte level,
 * since little and big endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 */

/*!
 * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
 */
typedef struct {
    unsigned char digest[4]; /*!< Hash bytes, big endian */
} XXH32_canonical_t;

/*!
 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
 *
 * @param dst The @ref XXH32_canonical_t pointer to be stored to.
 * @param hash The @ref XXH32_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);

/*!
 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
 *
 * @param src The @ref XXH32_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
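/*!
 * Example: writing a hash in canonical form. A minimal sketch with a
 * hypothetical helper:
 * @code{.c}
 *    #include <stdio.h>
 *    #include <xxhash.h>
 *
 *    // The stored bytes read back identically on any endianness.
 *    void writeHash(FILE* out, XXH32_hash_t hash)
 *    {
 *        XXH32_canonical_t canonical;
 *        XXH32_canonicalFromHash(&canonical, hash);
 *        fwrite(canonical.digest, sizeof(canonical.digest), 1, out);
 *    }
 * @endcode
 */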


#ifdef __has_attribute
# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
# define XXH_HAS_ATTRIBUTE(x) 0
#endif

/* C-language Attributes are added in C23. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
#else
# define XXH_HAS_C_ATTRIBUTE(x) 0
#endif

#if defined(__cplusplus) && defined(__has_cpp_attribute)
# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#else
# define XXH_HAS_CPP_ATTRIBUTE(x) 0
#endif

/*
   Define XXH_FALLTHROUGH macro for annotating switch cases with the 'fallthrough'
   attribute introduced in C++17 and C23.
   C++17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
   C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
*/
#if XXH_HAS_C_ATTRIBUTE(fallthrough)
# define XXH_FALLTHROUGH [[fallthrough]]
#elif XXH_HAS_CPP_ATTRIBUTE(fallthrough)
# define XXH_FALLTHROUGH [[fallthrough]]
#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
#else
# define XXH_FALLTHROUGH
#endif

/*!
 * @}
 * @ingroup public
 * @{
 */

#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
 *  64-bit hash
 ************************************************************************/
#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
/*!
 * @brief An unsigned 64-bit integer.
 *
 * Not necessarily defined to `uint64_t` but functionally equivalent.
 */
typedef uint64_t XXH64_hash_t;
#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint64_t XXH64_hash_t;
#else
#  include <limits.h>
#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
     /* LP64 ABI says uint64_t is unsigned long */
     typedef unsigned long XXH64_hash_t;
#  else
     /* the following type must have a width of 64-bit */
     typedef unsigned long long XXH64_hash_t;
#  endif
#endif

/*!
 * @}
 *
 * @defgroup xxh64_family XXH64 family
 * @ingroup public
 * @{
 * Contains functions used in the classic 64-bit xxHash algorithm.
 *
 * @note
 *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 *   and offers true 64/128 bit hash results.
 *   It provides better speed for systems with vector processing capabilities.
 */


/*!
 * @brief Calculates the 64-bit hash of @p input using xxHash64.
 *
 * This function usually runs faster on 64-bit systems, but slower on 32-bit
 * systems (see benchmark).
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 64-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit hash.
 *
 * @see
 *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
 */
XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);

/*******   Streaming   *******/
/*!
 * @brief The opaque state struct for the XXH64 streaming API.
 *
 * @see XXH64_state_s for details.
 */
typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);

XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);

/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
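/*!
 * Example: the XXH64 API mirrors XXH32. A minimal one-shot sketch with a
 * hypothetical helper:
 * @code{.c}
 *    #include <string.h>
 *    #include <xxhash.h>
 *
 *    XXH64_hash_t hashString64(const char* s)
 *    {
 *        return XXH64(s, strlen(s), 0);   // seed 0
 *    }
 * @endcode
 */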

/*!
 * @}
 * ************************************************************************
 * @defgroup xxh3_family XXH3 family
 * @ingroup public
 * @{
 *
 * XXH3 is a more recent hash algorithm featuring:
 *  - Improved speed for both small and large inputs
 *  - True 64-bit and 128-bit outputs
 *  - SIMD acceleration
 *  - Improved 32-bit viability
 *
 * Speed analysis methodology is explained here:
 *
 *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
 *
 * Compared to XXH64, expect XXH3 to run approximately
 * ~2x faster on large inputs and >3x faster on small ones,
 * though exact differences vary depending on the platform.
 *
 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
 * but does not require it.
 * Any 32-bit and 64-bit targets that can run XXH32 smoothly
 * can run XXH3 at competitive speeds, even without vector support.
 * Further details are explained in the implementation.
 *
 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
 * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
 *
 * The XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
 * and all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
 * XXH3 offers 2 variants, _64bits and _128bits.
 *
 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
 * reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */

/*-**********************************************************************
 *  XXH3 64-bit variant
 ************************************************************************/

/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);

/*
 * XXH3_64bits_withSeed():
 * This variant generates a custom secret on the fly
 * based on the default secret altered using the `seed` value.
 * While this operation is decently fast, note that it's not completely free.
 * Note: seed==0 produces the same results as XXH3_64bits().
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
 *
 * @see
 *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
 *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
 */
#define XXH3_SECRET_SIZE_MIN 136

/*
 * XXH3_64bits_withSecret():
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
 * However, the quality of the secret impacts the dispersion of the hash algorithm.
 * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever in doubt about the "randomness" of the blob of bytes,
 * consider employing "XXH3_generateSecret()" instead (see below).
 * It will generate a proper high entropy secret derived from the blob of bytes.
 * Another advantage of using XXH3_generateSecret() is that
 * it guarantees that all bits within the initial blob of bytes
 * will impact every bit of the output.
 * This is not necessarily the case when using the blob of bytes directly
 * because, when hashing _small_ inputs, only a portion of the secret is employed.
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
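/*
 * Example: deriving a proper secret with XXH3_generateSecret() (declared in
 * the experimental API below, which requires XXH_STATIC_LINKING_ONLY), then
 * hashing with it. A minimal sketch with a hypothetical helper; note that this
 * file defines XXH_NO_XXH3, which disables XXH3 in the util-linux build:
 * @code{.c}
 *    #define XXH_STATIC_LINKING_ONLY   // for XXH3_generateSecret()
 *    #include <xxhash.h>
 *
 *    XXH64_hash_t hashWithSecret(const void* data, size_t len,
 *                                const void* blob, size_t blobSize)
 *    {
 *        unsigned char secret[XXH3_SECRET_SIZE_MIN];
 *        // Turn an arbitrary blob into a high-entropy secret.
 *        XXH3_generateSecret(secret, sizeof(secret), blob, blobSize);
 *        return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 *    }
 * @endcode
 */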


/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 */

/*!
 * @brief The state struct for the XXH3 streaming API.
 *
 * @see XXH3_state_s for details.
 */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);

/*
 * XXH3_64bits_reset():
 * Initialize with default parameters.
 * The digest will be equivalent to `XXH3_64bits()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
/*
 * XXH3_64bits_reset_withSeed():
 * Generate a custom secret from `seed`, and store it into `statePtr`.
 * The digest will be equivalent to `XXH3_64bits_withSeed()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
/*
 * XXH3_64bits_reset_withSecret():
 * `secret` is referenced, so it _must outlive_ the hash streaming session.
 * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
 * and the quality of produced hash values depends on the secret's entropy
 * (the secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
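/*
 * Example: XXH3 streaming follows the same create/reset/update/digest pattern
 * as XXH32 and XXH64. A minimal sketch with a hypothetical helper:
 * @code{.c}
 *    #include <assert.h>
 *    #include <xxhash.h>
 *
 *    XXH64_hash_t hashTwoChunks(const void* c1, size_t len1,
 *                               const void* c2, size_t len2)
 *    {
 *        XXH64_hash_t hash;
 *        XXH3_state_t* state = XXH3_createState();
 *        assert(state != NULL);             // Error check here
 *        XXH3_64bits_reset(state);          // digest equals XXH3_64bits()
 *        XXH3_64bits_update(state, c1, len1);
 *        XXH3_64bits_update(state, c2, len2);
 *        hash = XXH3_64bits_digest(state);
 *        XXH3_freeState(state);
 *        return hash;
 *    }
 * @endcode
 */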

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */


/*-**********************************************************************
 *  XXH3 128-bit variant
 ************************************************************************/

/*!
 * @brief The return value from 128-bit hashes.
 *
 * Stored in little endian order, although the fields themselves are in native
 * endianness.
 */
typedef struct {
    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
    XXH64_hash_t high64;  /*!< `value >> 64` */
} XXH128_hash_t;

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 *
 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
 * Use the already declared XXH3_createState() and XXH3_freeState().
 *
 * All reset and streaming functions have the same meaning as their 64-bit counterparts.
 */

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);

/* The following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */

/*!
 * XXH128_isEqual():
 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
 */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);

/*!
 * XXH128_cmp():
 *
 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
 *
 * return: >0 if *h128_1  > *h128_2
 *         =0 if *h128_1 == *h128_2
 *         <0 if *h128_1  < *h128_2
 */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
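/*
 * Example: XXH128_cmp() matches the comparator contract of qsort(), so an
 * array of hashes can be sorted directly. A minimal sketch:
 * @code{.c}
 *    #include <stdlib.h>
 *    #include <xxhash.h>
 *
 *    void sortHashes(XXH128_hash_t* hashes, size_t count)
 *    {
 *        qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 *    }
 * @endcode
 */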


/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);


#endif  /* XXH_NO_LONG_LONG */

/*!
 * @}
 */
#endif /* XXHASH_H_5627135585666179 */



#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
#define XXHASH_H_STATIC_13879238742
/* ****************************************************************************
 * This section contains declarations which are not guaranteed to remain stable.
 * They may change in future versions, becoming incompatible with a different
 * version of the library.
 * These declarations should only be used with static linking.
 * Never use them in association with dynamic linking!
 ***************************************************************************** */

/*
 * These definitions are only present to allow static allocation
 * of XXH states, on stack or in a struct, for example.
 * Never **ever** access their members directly.
 */

/*!
 * @internal
 * @brief Structure for XXH32 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH32_state_t.
 * Do not access the members of this struct directly.
 * @see XXH64_state_s, XXH3_state_s
 */
struct XXH32_state_s {
   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
   XXH32_hash_t v[4];         /*!< Accumulator lanes */
   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it; it may be removed. */
};   /* typedef'd to XXH32_state_t */


#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */

/*!
 * @internal
 * @brief Structure for XXH64 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH64_state_t.
 * Do not access the members of this struct directly.
 * @see XXH32_state_s, XXH3_state_s
 */
struct XXH64_state_s {
   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
   XXH64_hash_t v[4];         /*!< Accumulator lanes */
   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways */
   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it; it may be removed. */
};   /* typedef'd to XXH64_state_t */

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
#  include <stdalign.h>
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
/* In C++ alignas() is a keyword */
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__GNUC__)
#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXH_ALIGN(n)      __declspec(align(n))
#else
#  define XXH_ALIGN(n)   /* disabled */
#endif

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/*!
 * @brief The size of the internal XXH3 buffer.
 *
 * This is the optimal update size for incremental hashing.
 *
 * @see XXH3_64b_update(), XXH3_128b_update().
 */
#define XXH3_INTERNALBUFFER_SIZE 256

/*!
 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
 *
 * This is the size used in @ref XXH3_kSecret and the seeded functions.
 *
 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
 */
#define XXH3_SECRET_DEFAULT_SIZE 192

/*!
 * @internal
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
 * Otherwise it is an opaque type.
 * Never use this definition in combination with a dynamic library.
 * This allows fields to safely be changed in the future.
 *
 * @note ** This structure has a strict alignment requirement of 64 bytes!! **
 * Do not allocate this with `malloc()` or `new`,
 * it will not be sufficiently aligned.
 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
 * Do never access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
 * @see XXH32_state_s, XXH64_state_s
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
       /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
       /*!< The internal buffer. @see XXH32_state_s::mem32 */
   XXH32_hash_t bufferedSize;
       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
   XXH32_hash_t useSeed;
       /*!< Reserved field. Needed for padding on 64-bit. */
   size_t nbStripesSoFar;
       /*!< Number of stripes processed. */
   XXH64_hash_t totalLen;
       /*!< Total length hashed. 64-bit even on 32-bit targets. */
   size_t nbStripesPerBlock;
       /*!< Number of stripes per block. */
   size_t secretLimit;
       /*!< Size of @ref customSecret or @ref extSecret */
   XXH64_hash_t seed;
       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
   XXH64_hash_t reserved64;
       /*!< Reserved field. */
   const unsigned char* extSecret;
       /*!< Reference to an external secret for the _withSecret variants, NULL
        *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */

#undef XXH_ALIGN_MEMBER

/*!
 * @brief Initializes a stack-allocated `XXH3_state_s`.
 *
 * When the @ref XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
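/*
 * Example: stack allocation with XXH3_INITSTATE(). A minimal sketch with a
 * hypothetical helper; XXH_STATIC_LINKING_ONLY is needed so that the full
 * definition of XXH3_state_t is visible:
 * @code{.c}
 *    #define XXH_STATIC_LINKING_ONLY   // exposes the XXH3_state_t definition
 *    #include <xxhash.h>
 *
 *    XXH64_hash_t hashOnStack(const void* data, size_t len, XXH64_hash_t seed)
 *    {
 *        XXH3_state_t state;          // no heap allocation
 *        XXH3_INITSTATE(&state);      // required before a _withSeed reset
 *        XXH3_64bits_reset_withSeed(&state, seed);
 *        XXH3_64bits_update(&state, data, len);
 *        return XXH3_64bits_digest(&state);
 *    }
 * @endcode
 */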
1120 | ||
1121 | ||
1122 | /* XXH128() : | |
1123 | * simple alias to pre-selected XXH3_128bits variant | |
1124 | */ | |
1125 | XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); | |
1126 | ||
1127 | ||
1128 | /* === Experimental API === */ | |
1129 | /* Symbols defined below must be considered tied to a specific library version. */ | |
1130 | ||
1131 | /* | |
1132 | * XXH3_generateSecret(): | |
1133 | * | |
1134 | * Derive a high-entropy secret from any user-defined content, named customSeed. | |
1135 | * The generated secret can be used in combination with `*_withSecret()` functions. | |
1136 | * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed, | |
1137 | * as it becomes much more difficult for an external actor to guess how to impact the calculation logic. | |
1138 | * | |
1139 | * The function accepts as input a custom seed of any length and any content, | |
1140 | * and derives from it a high-entropy secret of length @secretSize | |
1141 | * into an already allocated buffer @secretBuffer. | |
1142 | * @secretSize must be >= XXH3_SECRET_SIZE_MIN | |
1143 | * | |
1144 | * The generated secret can then be used with any `*_withSecret()` variant. | |
1145 | * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, | |
1146 | * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` | |
1147 | * are part of this list. They all accept a `secret` parameter | |
1148 | * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN) | |
1149 | * _and_ feature very high entropy (consist of random-looking bytes). | |
1150 | * These conditions can be a high bar to meet, so | |
1151 | * XXH3_generateSecret() can be employed to ensure proper quality. | |
1152 | * | |
1153 | * customSeed can be anything. It can have any size, even small ones, | |
1154 | * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes. | |
1155 | * The resulting `secret` will nonetheless provide all required qualities. | |
1156 | * | |
1157 | * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior. | |
1158 | */ | |
1159 | XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize); | |
1160 | ||
1161 | ||
1162 | /* | |
1163 | * XXH3_generateSecret_fromSeed(): | |
1164 | * | |
1165 | * Generate the same secret as the _withSeed() variants. | |
1166 | * | |
1167 | * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily). | |
1168 | * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes. | |
1169 | * | |
1170 | * The generated secret can be used in combination with | |
1171 | *`*_withSecret()` and `_withSecretandSeed()` variants. | |
1172 | * This generator is notably useful in combination with `_withSecretandSeed()`, | |
1173 | * as a way to emulate a faster `_withSeed()` variant. | |
1174 | */ | |
1175 | XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed); | |
1176 | ||
1177 | /* | |
1178 | * *_withSecretandSeed() : | |
1179 | * These variants generate hash values using either | |
1180 | * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) | |
1181 | * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX). | |
1182 | * | |
1183 | * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. | |
1184 | * `_withSeed()` has to generate the secret on the fly for "large" keys. | |
1185 | * It's fast, but can be perceptible for "not so large" keys (< 1 KB). | |
1186 | * `_withSecret()` has to generate the masks on the fly for "small" keys, | |
1187 | * which requires more instructions than _withSeed() variants. | |
1188 | * Therefore, _withSecretandSeed variant combines the best of both worlds. | |
1189 | * | |
1190 | * When @secret has been generated by XXH3_generateSecret_fromSeed(), | |
1191 | * this variant produces *exactly* the same results as `_withSeed()` variant, | |
1192 | * hence offering only a pure speed benefit on "large" input, | |
1193 | * by skipping the need to regenerate the secret for every large input. | |
1194 | * | |
1195 | * Another usage scenario is to hash the secret to a 64-bit hash value, | |
1196 | * for example with XXH3_64bits(), which then becomes the seed, | |
1197 | * and then employ both the seed and the secret in _withSecretandSeed(). | |
1198 | * On top of speed, an added benefit is that each bit in the secret | |
1199 | * has a 50% chance to flip each bit in the output, | |
1200 | * via its impact on the seed. | |
1201 | * This is not guaranteed when using the secret directly in "small data" scenarios, | |
1202 | * because only portions of the secret are employed for small data. | |
1203 | */ | |
1204 | XXH_PUBLIC_API XXH64_hash_t | |
1205 | XXH3_64bits_withSecretandSeed(const void* data, size_t len, | |
1206 | const void* secret, size_t secretSize, | |
1207 | XXH64_hash_t seed); | |
1208 | ||
1209 | XXH_PUBLIC_API XXH128_hash_t | |
1210 | XXH3_128bits_withSecretandSeed(const void* data, size_t len, | |
1211 | const void* secret, size_t secretSize, | |
1212 | XXH64_hash_t seed64); | |
1213 | ||
1214 | XXH_PUBLIC_API XXH_errorcode | |
1215 | XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, | |
1216 | const void* secret, size_t secretSize, | |
1217 | XXH64_hash_t seed64); | |
1218 | ||
1219 | XXH_PUBLIC_API XXH_errorcode | |
1220 | XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, | |
1221 | const void* secret, size_t secretSize, | |
1222 | XXH64_hash_t seed64); | |
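/*
 * A usage sketch combining both generators above (illustrative only;
 * data/len are assumed to be defined by the caller):
 *
 * @code
 * static unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 * XXH64_hash_t const seed = 42;
 * XXH3_generateSecret_fromSeed(secret, seed);   // one-time setup
 * // from now on, the result matches XXH3_64bits_withSeed(data, len, seed)
 * // exactly, but large inputs skip the per-call secret generation:
 * XXH64_hash_t const h = XXH3_64bits_withSecretandSeed(data, len,
 *                                secret, sizeof(secret), seed);
 * @endcode
 */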
1223 | ||
1224 | ||
1225 | #endif /* XXH_NO_LONG_LONG */ | |
1226 | #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) | |
1227 | # define XXH_IMPLEMENTATION | |
1228 | #endif | |
1229 | ||
1230 | #endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ | |
1231 | ||
1232 | ||
1233 | /* ======================================================================== */ | |
1234 | /* ======================================================================== */ | |
1235 | /* ======================================================================== */ | |
1236 | ||
1237 | ||
1238 | /*-********************************************************************** | |
1239 | * xxHash implementation | |
1240 | *-********************************************************************** | |
1241 | * xxHash's implementation used to be hosted inside xxhash.c. | |
1242 | * | |
1243 | * However, inlining requires the implementation to be visible to the compiler, | |
1244 | * hence included alongside the header. | |
1245 | * Previously, the implementation was hosted inside xxhash.c, | |
1246 | * which was then #included when inlining was activated. | |
1247 | * This construction created issues with a few build and install systems, | |
1248 | * as it required xxhash.c to be stored in the /include directory. | |
1249 | * | |
1250 | * xxHash implementation is now directly integrated within xxhash.h. | |
1251 | * As a consequence, xxhash.c is no longer needed in /include. | |
1252 | * | |
1253 | * xxhash.c is still available and is still useful. | |
1254 | * In a "normal" setup, when xxhash is not inlined, | |
1255 | * xxhash.h only exposes the prototypes and public symbols, | |
1256 | * while xxhash.c can be built into an object file xxhash.o | |
1257 | * which can then be linked into the final binary. | |
1258 | ************************************************************************/ | |
1259 | ||
1260 | #if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ | |
1261 | || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) | |
1262 | # define XXH_IMPLEM_13a8737387 | |
1263 | ||
1264 | /* ************************************* | |
1265 | * Tuning parameters | |
1266 | ***************************************/ | |
1267 | ||
1268 | /*! | |
1269 | * @defgroup tuning Tuning parameters | |
1270 | * @{ | |
1271 | * | |
1272 | * Various macros to control xxHash's behavior. | |
1273 | */ | |
1274 | #ifdef XXH_DOXYGEN | |
1275 | /*! | |
1276 | * @brief Define this to disable 64-bit code. | |
1277 | * | |
1278 | * Useful if you only use the @ref xxh32_family and have a strict C90 compiler. | |
1279 | */ | |
1280 | # define XXH_NO_LONG_LONG | |
1281 | # undef XXH_NO_LONG_LONG /* don't actually */ | |
1282 | /*! | |
1283 | * @brief Controls how unaligned memory is accessed. | |
1284 | * | |
1285 | * By default, access to unaligned memory is controlled by `memcpy()`, which is | |
1286 | * safe and portable. | |
1287 | * | |
1288 | * Unfortunately, on some target/compiler combinations, the generated assembly | |
1289 | * is sub-optimal. | |
1290 | * | |
1291 | * The switch below allows selection of a different access method | |
1292 | * in the search for improved performance. | |
1293 | * | |
1294 | * @par Possible options: | |
1295 | * | |
1296 | * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` | |
1297 | * @par | |
1298 | * Use `memcpy()`. Safe and portable. Note that most modern compilers will | |
1299 | * eliminate the function call and treat it as an unaligned access. | |
1300 | * | |
1301 | * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` | |
1302 | * @par | |
1303 | * Depends on compiler extensions and is therefore not portable. | |
1304 | * This method is safe _if_ your compiler supports it, | |
1305 | * and is *generally* as fast as or faster than `memcpy`. | |
1306 | * | |
1307 | * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast | |
1308 | * @par | |
1309 | * Casts directly and dereferences. This method doesn't depend on the | |
1310 | * compiler, but it violates the C standard as it directly dereferences an | |
1311 | * unaligned pointer. It can generate buggy code on targets which do not | |
1312 | * support unaligned memory accesses, but in some circumstances, it's the | |
1313 | * only known way to get the most performance. | |
1314 | * | |
1315 | * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift | |
1316 | * @par | |
1317 | * Also portable. This can generate the best code on old compilers which don't | |
1318 | * inline small `memcpy()` calls, and it might also be faster on big-endian | |
1319 | * systems which lack a native byteswap instruction. However, some compilers | |
1320 | * will emit literal byteshifts even if the target supports unaligned access. | |
1321 | * . | |
1322 | * | |
1323 | * @warning | |
1324 | * Methods 1 and 2 rely on implementation-defined behavior. Use these with | |
1325 | * care, as what works on one compiler/platform/optimization level may cause | |
1326 | * another to read garbage data or even crash. | |
1327 | * | |
1328 | * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. | |
1329 | * | |
1330 | * Prefer these methods in priority order (0 > 3 > 1 > 2) | |
1331 | */ | |
1332 | # define XXH_FORCE_MEMORY_ACCESS 0 | |
1333 | ||
1334 | /*! | |
1335 | * @def XXH_FORCE_ALIGN_CHECK | |
1336 | * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() | |
1337 | * and XXH64() only). | |
1338 | * | |
1339 | * This is an important performance trick for architectures without decent | |
1340 | * unaligned memory access performance. | |
1341 | * | |
1342 | * It checks for input alignment, and when conditions are met, uses a "fast | |
1343 | * path" employing direct 32-bit/64-bit reads, resulting in _dramatically | |
1344 | * faster_ read speed. | |
1345 | * | |
1346 | * The check costs one initial branch per hash, which is generally negligible, | |
1347 | * but not zero. | |
1348 | * | |
1349 | * Moreover, it's not useful to generate an additional code path if memory | |
1350 | * access uses the same instruction for both aligned and unaligned | |
1351 | * addresses (e.g. x86 and aarch64). | |
1352 | * | |
1353 | * In these cases, the alignment check can be removed by setting this macro to 0. | |
1354 | * Then the code will always use unaligned memory access. | |
1355 | * The alignment check is automatically disabled on x86, x64 & arm64, | |
1356 | * platforms known to offer good unaligned memory access performance. | |
1357 | * | |
1358 | * This option does not affect XXH3 (only XXH32 and XXH64). | |
1359 | */ | |
1360 | # define XXH_FORCE_ALIGN_CHECK 0 | |
1361 | ||
1362 | /*! | |
1363 | * @def XXH_NO_INLINE_HINTS | |
1364 | * @brief When non-zero, sets all functions to `static`. | |
1365 | * | |
1366 | * By default, xxHash tries to force the compiler to inline almost all internal | |
1367 | * functions. | |
1368 | * | |
1369 | * This can usually improve performance due to reduced jumping and improved | |
1370 | * constant folding, but significantly increases the size of the binary which | |
1371 | * might not be favorable. | |
1372 | * | |
1373 | * Additionally, sometimes the forced inlining can be detrimental to performance, | |
1374 | * depending on the architecture. | |
1375 | * | |
1376 | * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the | |
1377 | * compiler full control on whether to inline or not. | |
1378 | * | |
1379 | * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using | |
1380 | * -fno-inline with GCC or Clang, this will automatically be defined. | |
1381 | */ | |
1382 | # define XXH_NO_INLINE_HINTS 0 | |
1383 | ||
1384 | /*! | |
1385 | * @def XXH32_ENDJMP | |
1386 | * @brief Whether to use a jump for `XXH32_finalize`. | |
1387 | * | |
1388 | * By default, `XXH32_finalize` uses multiple branches in the finalizer. | |
1389 | * This is generally faster, | |
1390 | * but depending on the exact architecture, a single jump may be preferable. | |
1391 | * | |
1392 | * This setting only makes a measurable difference for very small inputs. | |
1393 | */ | |
1394 | # define XXH32_ENDJMP 0 | |
1395 | ||
1396 | /*! | |
1397 | * @internal | |
1398 | * @brief Redefines old internal names. | |
1399 | * | |
1400 | * For compatibility with code that uses xxHash's internals before the names | |
1401 | * were changed to improve namespacing. There is no other reason to use this. | |
1402 | */ | |
1403 | # define XXH_OLD_NAMES | |
1404 | # undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ | |
1405 | #endif /* XXH_DOXYGEN */ | |
1406 | /*! | |
1407 | * @} | |
1408 | */ | |
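/*
 * All of the above are meant to be set at build time. For example, a build
 * wanting byteshift loads and no inline hints could use a compiler
 * invocation along these lines (an illustrative command, not a requirement):
 *
 *   cc -O3 -DXXH_FORCE_MEMORY_ACCESS=3 -DXXH_NO_INLINE_HINTS=1 -c xxhash.c
 */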
1409 | ||
1410 | #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ | |
1411 | /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */ | |
1412 | # if !defined(__clang__) && \ | |
1413 | ( \ | |
1414 | (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ | |
1415 | ( \ | |
1416 | defined(__GNUC__) && ( \ | |
1417 | (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \ | |
1418 | ( \ | |
1419 | defined(__mips__) && \ | |
1420 | (__mips <= 5 || __mips_isa_rev < 6) && \ | |
1421 | (!defined(__mips16) || defined(__mips_mips16e2)) \ | |
1422 | ) \ | |
1423 | ) \ | |
1424 | ) \ | |
1425 | ) | |
1426 | # define XXH_FORCE_MEMORY_ACCESS 1 | |
1427 | # endif | |
1428 | #endif | |
1429 | ||
1430 | #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ | |
1431 | # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \ | |
1432 | || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ | |
1433 | # define XXH_FORCE_ALIGN_CHECK 0 | |
1434 | # else | |
1435 | # define XXH_FORCE_ALIGN_CHECK 1 | |
1436 | # endif | |
1437 | #endif | |
1438 | ||
1439 | #ifndef XXH_NO_INLINE_HINTS | |
1440 | # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ | |
1441 | || defined(__NO_INLINE__) /* -O0, -fno-inline */ | |
1442 | # define XXH_NO_INLINE_HINTS 1 | |
1443 | # else | |
1444 | # define XXH_NO_INLINE_HINTS 0 | |
1445 | # endif | |
1446 | #endif | |
1447 | ||
1448 | #ifndef XXH32_ENDJMP | |
1449 | /* generally preferable for performance */ | |
1450 | # define XXH32_ENDJMP 0 | |
1451 | #endif | |
1452 | ||
1453 | /*! | |
1454 | * @defgroup impl Implementation | |
1455 | * @{ | |
1456 | */ | |
1457 | ||
1458 | ||
1459 | /* ************************************* | |
1460 | * Includes & Memory related functions | |
1461 | ***************************************/ | |
1462 | /* | |
1463 | * Modify the local functions below should you wish to use | |
1464 | * different memory routines for malloc() and free() | |
1465 | */ | |
1466 | #include <stdlib.h> | |
1467 | ||
1468 | /*! | |
1469 | * @internal | |
1470 | * @brief Modify this function to use a different routine than malloc(). | |
1471 | */ | |
1472 | static void* XXH_malloc(size_t s) { return malloc(s); } | |
1473 | ||
1474 | /*! | |
1475 | * @internal | |
1476 | * @brief Modify this function to use a different routine than free(). | |
1477 | */ | |
1478 | static void XXH_free(void* p) { free(p); } | |
1479 | ||
1480 | #include <string.h> | |
1481 | ||
1482 | /*! | |
1483 | * @internal | |
1484 | * @brief Modify this function to use a different routine than memcpy(). | |
1485 | */ | |
1486 | static void* XXH_memcpy(void* dest, const void* src, size_t size) | |
1487 | { | |
1488 | return memcpy(dest,src,size); | |
1489 | } | |
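/*
 * For instance, a hypothetical freestanding build could route these through
 * a project-provided arena instead of the C heap (sketch; arena_alloc() and
 * arena_release() are assumed to exist in the host project):
 *
 *   static void* XXH_malloc(size_t s) { return arena_alloc(s); }
 *   static void  XXH_free(void* p)    { arena_release(p); }
 */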
1490 | ||
1491 | #include <limits.h> /* ULLONG_MAX */ | |
1492 | ||
1493 | ||
1494 | /* ************************************* | |
1495 | * Compiler Specific Options | |
1496 | ***************************************/ | |
1497 | #ifdef _MSC_VER /* Visual Studio warning fix */ | |
1498 | # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ | |
1499 | #endif | |
1500 | ||
1501 | #if XXH_NO_INLINE_HINTS /* disable inlining hints */ | |
1502 | # if defined(__GNUC__) || defined(__clang__) | |
1503 | # define XXH_FORCE_INLINE static __attribute__((unused)) | |
1504 | # else | |
1505 | # define XXH_FORCE_INLINE static | |
1506 | # endif | |
1507 | # define XXH_NO_INLINE static | |
1508 | /* enable inlining hints */ | |
1509 | #elif defined(__GNUC__) || defined(__clang__) | |
1510 | # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) | |
1511 | # define XXH_NO_INLINE static __attribute__((noinline)) | |
1512 | #elif defined(_MSC_VER) /* Visual Studio */ | |
1513 | # define XXH_FORCE_INLINE static __forceinline | |
1514 | # define XXH_NO_INLINE static __declspec(noinline) | |
1515 | #elif defined (__cplusplus) \ | |
1516 | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ | |
1517 | # define XXH_FORCE_INLINE static inline | |
1518 | # define XXH_NO_INLINE static | |
1519 | #else | |
1520 | # define XXH_FORCE_INLINE static | |
1521 | # define XXH_NO_INLINE static | |
1522 | #endif | |
1523 | ||
1524 | ||
1525 | ||
1526 | /* ************************************* | |
1527 | * Debug | |
1528 | ***************************************/ | |
1529 | /*! | |
1530 | * @ingroup tuning | |
1531 | * @def XXH_DEBUGLEVEL | |
1532 | * @brief Sets the debugging level. | |
1533 | * | |
1534 | * XXH_DEBUGLEVEL is expected to be defined externally, typically via the | |
1535 | * compiler's command line options. The value must be a number. | |
1536 | */ | |
1537 | #ifndef XXH_DEBUGLEVEL | |
1538 | # ifdef DEBUGLEVEL /* backwards compat */ | |
1539 | # define XXH_DEBUGLEVEL DEBUGLEVEL | |
1540 | # else | |
1541 | # define XXH_DEBUGLEVEL 0 | |
1542 | # endif | |
1543 | #endif | |
1544 | ||
1545 | #if (XXH_DEBUGLEVEL>=1) | |
1546 | # include <assert.h> /* note: can still be disabled with NDEBUG */ | |
1547 | # define XXH_ASSERT(c) assert(c) | |
1548 | #else | |
1549 | # define XXH_ASSERT(c) ((void)0) | |
1550 | #endif | |
1551 | ||
1552 | /* note: use after variable declarations */ | |
1553 | #ifndef XXH_STATIC_ASSERT | |
1554 | # if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ | |
1555 | # include <assert.h> | |
1556 | # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) | |
1557 | # elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ | |
1558 | # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) | |
1559 | # else | |
1560 | # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) | |
1561 | # endif | |
1562 | # define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) | |
1563 | #endif | |
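/*
 * Usage sketch: placed after the variable declarations of a function body,
 *
 *   XXH_STATIC_ASSERT(sizeof(void*) >= 4);
 *
 * fails compilation when the condition is false, and emits no code otherwise.
 */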
1564 | ||
1565 | /*! | |
1566 | * @internal | |
1567 | * @def XXH_COMPILER_GUARD(var) | |
1568 | * @brief Used to prevent unwanted optimizations for @p var. | |
1569 | * | |
1570 | * It uses an empty GCC inline assembly statement with a register constraint | |
1571 | * which forces @p var into a general purpose register (eg eax, ebx, ecx | |
1572 | * on x86) and marks it as modified. | |
1573 | * | |
1574 | * This is used in a few places to avoid unwanted autovectorization (e.g. | |
1575 | * XXH32_round()). All vectorization we want is explicit via intrinsics, | |
1576 | * and _usually_ isn't wanted elsewhere. | |
1577 | * | |
1578 | * We also use it to prevent unwanted constant folding for AArch64 in | |
1579 | * XXH3_initCustomSecret_scalar(). | |
1580 | */ | |
1581 | #if defined(__GNUC__) || defined(__clang__) | |
1582 | # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var)) | |
1583 | #else | |
1584 | # define XXH_COMPILER_GUARD(var) ((void)0) | |
1585 | #endif | |
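/*
 * Illustrative use (sketch):
 *
 *   xxh_u32 x = a * b;
 *   XXH_COMPILER_GUARD(x);   // compiler must assume the asm may change x
 *
 * The optimizer can no longer fuse or vectorize across this point, because
 * the empty asm statement claims to read and write the register holding x.
 */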
1586 | ||
1587 | /* ************************************* | |
1588 | * Basic Types | |
1589 | ***************************************/ | |
1590 | #if !defined (__VMS) \ | |
1591 | && (defined (__cplusplus) \ | |
1592 | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) | |
1593 | # include <stdint.h> | |
1594 | typedef uint8_t xxh_u8; | |
1595 | #else | |
1596 | typedef unsigned char xxh_u8; | |
1597 | #endif | |
1598 | typedef XXH32_hash_t xxh_u32; | |
1599 | ||
1600 | #ifdef XXH_OLD_NAMES | |
1601 | # define BYTE xxh_u8 | |
1602 | # define U8 xxh_u8 | |
1603 | # define U32 xxh_u32 | |
1604 | #endif | |
1605 | ||
1606 | /* *** Memory access *** */ | |
1607 | ||
1608 | /*! | |
1609 | * @internal | |
1610 | * @fn xxh_u32 XXH_read32(const void* ptr) | |
1611 | * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. | |
1612 | * | |
1613 | * Affected by @ref XXH_FORCE_MEMORY_ACCESS. | |
1614 | * | |
1615 | * @param ptr The pointer to read from. | |
1616 | * @return The 32-bit native endian integer from the bytes at @p ptr. | |
1617 | */ | |
1618 | ||
1619 | /*! | |
1620 | * @internal | |
1621 | * @fn xxh_u32 XXH_readLE32(const void* ptr) | |
1622 | * @brief Reads an unaligned 32-bit little endian integer from @p ptr. | |
1623 | * | |
1624 | * Affected by @ref XXH_FORCE_MEMORY_ACCESS. | |
1625 | * | |
1626 | * @param ptr The pointer to read from. | |
1627 | * @return The 32-bit little endian integer from the bytes at @p ptr. | |
1628 | */ | |
1629 | ||
1630 | /*! | |
1631 | * @internal | |
1632 | * @fn xxh_u32 XXH_readBE32(const void* ptr) | |
1633 | * @brief Reads an unaligned 32-bit big endian integer from @p ptr. | |
1634 | * | |
1635 | * Affected by @ref XXH_FORCE_MEMORY_ACCESS. | |
1636 | * | |
1637 | * @param ptr The pointer to read from. | |
1638 | * @return The 32-bit big endian integer from the bytes at @p ptr. | |
1639 | */ | |
1640 | ||
1641 | /*! | |
1642 | * @internal | |
1643 | * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) | |
1644 | * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. | |
1645 | * | |
1646 | * Affected by @ref XXH_FORCE_MEMORY_ACCESS. | |
1647 | * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is | |
1648 | * always @ref XXH_alignment::XXH_unaligned. | |
1649 | * | |
1650 | * @param ptr The pointer to read from. | |
1651 | * @param align Whether @p ptr is aligned. | |
1652 | * @pre | |
1653 | * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte | |
1654 | * aligned. | |
1655 | * @return The 32-bit little endian integer from the bytes at @p ptr. | |
1656 | */ | |
1657 | ||
1658 | #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) | |
1659 | /* | |
1660 | * Manual byteshift. Best for old compilers which don't inline memcpy. | |
1661 | * We actually directly use XXH_readLE32 and XXH_readBE32. | |
1662 | */ | |
1663 | #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) | |
1664 | ||
1665 | /* | |
1666 | * Force direct memory access. Only works on CPUs which support unaligned memory | |
1667 | * access in hardware. | |
1668 | */ | |
1669 | static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } | |
1670 | ||
1671 | #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) | |
1672 | ||
1673 | /* | |
1674 | * __pack instructions are safer but compiler specific, hence potentially | |
1675 | * problematic for some compilers. | |
1676 | * | |
1677 | * Currently only defined for GCC and ICC. | |
1678 | */ | |
1679 | #ifdef XXH_OLD_NAMES | |
1680 | typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; | |
1681 | #endif | |
1682 | static xxh_u32 XXH_read32(const void* ptr) | |
1683 | { | |
1684 | typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign; | |
1685 | return ((const xxh_unalign*)ptr)->u32; | |
1686 | } | |
1687 | ||
1688 | #else | |
1689 | ||
1690 | /* | |
1691 | * Portable and safe solution. Generally efficient. | |
1692 | * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html | |
1693 | */ | |
1694 | static xxh_u32 XXH_read32(const void* memPtr) | |
1695 | { | |
1696 | xxh_u32 val; | |
1697 | XXH_memcpy(&val, memPtr, sizeof(val)); | |
1698 | return val; | |
1699 | } | |
1700 | ||
1701 | #endif /* XXH_FORCE_MEMORY_ACCESS */ | |
1702 | ||
1703 | ||
1704 | /* *** Endianness *** */ | |
1705 | ||
1706 | /*! | |
1707 | * @ingroup tuning | |
1708 | * @def XXH_CPU_LITTLE_ENDIAN | |
1709 | * @brief Whether the target is little endian. | |
1710 | * | |
1711 | * Defined to 1 if the target is little endian, or 0 if it is big endian. | |
1712 | * It can be defined externally, for example on the compiler command line. | |
1713 | * | |
1714 | * If it is not defined, | |
1715 | * a runtime check (which is usually constant folded) is used instead. | |
1716 | * | |
1717 | * @note | |
1718 | * This is not necessarily defined to an integer constant. | |
1719 | * | |
1720 | * @see XXH_isLittleEndian() for the runtime check. | |
1721 | */ | |
1722 | #ifndef XXH_CPU_LITTLE_ENDIAN | |
1723 | /* | |
1724 | * Try to detect endianness automatically, to avoid the nonstandard behavior | |
1725 | * in `XXH_isLittleEndian()` | |
1726 | */ | |
1727 | # if defined(_WIN32) /* Windows is always little endian */ \ | |
1728 | || defined(__LITTLE_ENDIAN__) \ | |
1729 | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) | |
1730 | # define XXH_CPU_LITTLE_ENDIAN 1 | |
1731 | # elif defined(__BIG_ENDIAN__) \ | |
1732 | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |
1733 | # define XXH_CPU_LITTLE_ENDIAN 0 | |
1734 | # else | |
1735 | /*! | |
1736 | * @internal | |
1737 | * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. | |
1738 | * | |
1739 | * Most compilers will constant fold this. | |
1740 | */ | |
1741 | static int XXH_isLittleEndian(void) | |
1742 | { | |
1743 | /* | |
1744 | * Portable and well-defined behavior. | |
1745 | * Don't use static: it is detrimental to performance. | |
1746 | */ | |
1747 | const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; | |
1748 | return one.c[0]; | |
1749 | } | |
1750 | # define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() | |
1751 | # endif | |
1752 | #endif | |
1753 | ||
1754 | ||
1755 | ||
1756 | ||
1757 | /* **************************************** | |
1758 | * Compiler-specific Functions and Macros | |
1759 | ******************************************/ | |
1760 | #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) | |
1761 | ||
1762 | #ifdef __has_builtin | |
1763 | # define XXH_HAS_BUILTIN(x) __has_builtin(x) | |
1764 | #else | |
1765 | # define XXH_HAS_BUILTIN(x) 0 | |
1766 | #endif | |
1767 | ||
1768 | /*! | |
1769 | * @internal | |
1770 | * @def XXH_rotl32(x,r) | |
1771 | * @brief 32-bit rotate left. | |
1772 | * | |
1773 | * @param x The 32-bit integer to be rotated. | |
1774 | * @param r The number of bits to rotate. | |
1775 | * @pre | |
1776 | * @p r > 0 && @p r < 32 | |
1777 | * @note | |
1778 | * @p x and @p r may be evaluated multiple times. | |
1779 | * @return The rotated result. | |
1780 | */ | |
1781 | #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ | |
1782 | && XXH_HAS_BUILTIN(__builtin_rotateleft64) | |
1783 | # define XXH_rotl32 __builtin_rotateleft32 | |
1784 | # define XXH_rotl64 __builtin_rotateleft64 | |
1785 | /* Note: although _rotl exists for MinGW (GCC under Windows), its performance seems poor */ | |
1786 | #elif defined(_MSC_VER) | |
1787 | # define XXH_rotl32(x,r) _rotl(x,r) | |
1788 | # define XXH_rotl64(x,r) _rotl64(x,r) | |
1789 | #else | |
1790 | # define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) | |
1791 | # define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) | |
1792 | #endif | |
1793 | ||
1794 | /*! | |
1795 | * @internal | |
1796 | * @fn xxh_u32 XXH_swap32(xxh_u32 x) | |
1797 | * @brief A 32-bit byteswap. | |
1798 | * | |
1799 | * @param x The 32-bit integer to byteswap. | |
1800 | * @return @p x, byteswapped. | |
1801 | */ | |
1802 | #if defined(_MSC_VER) /* Visual Studio */ | |
1803 | # define XXH_swap32 _byteswap_ulong | |
1804 | #elif XXH_GCC_VERSION >= 403 | |
1805 | # define XXH_swap32 __builtin_bswap32 | |
1806 | #else | |
1807 | static xxh_u32 XXH_swap32 (xxh_u32 x) | |
1808 | { | |
1809 | return ((x << 24) & 0xff000000 ) | | |
1810 | ((x << 8) & 0x00ff0000 ) | | |
1811 | ((x >> 8) & 0x0000ff00 ) | | |
1812 | ((x >> 24) & 0x000000ff ); | |
1813 | } | |
1814 | #endif | |
1815 | ||
1816 | ||
1817 | /* *************************** | |
1818 | * Memory reads | |
1819 | *****************************/ | |
1820 | ||
1821 | /*! | |
1822 | * @internal | |
1823 | * @brief Enum to indicate whether a pointer is aligned. | |
1824 | */ | |
1825 | typedef enum { | |
1826 | XXH_aligned, /*!< Aligned */ | |
1827 | XXH_unaligned /*!< Possibly unaligned */ | |
1828 | } XXH_alignment; | |
1829 | ||
1830 | /* | |
1831 | * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. | |
1832 | * | |
1833 | * This is ideal for older compilers which don't inline memcpy. | |
1834 | */ | |
1835 | #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) | |
1836 | ||
1837 | XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) | |
1838 | { | |
1839 | const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; | |
1840 | return bytePtr[0] | |
1841 | | ((xxh_u32)bytePtr[1] << 8) | |
1842 | | ((xxh_u32)bytePtr[2] << 16) | |
1843 | | ((xxh_u32)bytePtr[3] << 24); | |
1844 | } | |
1845 | ||
1846 | XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) | |
1847 | { | |
1848 | const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; | |
1849 | return bytePtr[3] | |
1850 | | ((xxh_u32)bytePtr[2] << 8) | |
1851 | | ((xxh_u32)bytePtr[1] << 16) | |
1852 | | ((xxh_u32)bytePtr[0] << 24); | |
1853 | } | |
1854 | ||
1855 | #else | |
1856 | XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) | |
1857 | { | |
1858 | return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); | |
1859 | } | |
1860 | ||
1861 | static xxh_u32 XXH_readBE32(const void* ptr) | |
1862 | { | |
1863 | return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); | |
1864 | } | |
1865 | #endif | |
1866 | ||
1867 | XXH_FORCE_INLINE xxh_u32 | |
1868 | XXH_readLE32_align(const void* ptr, XXH_alignment align) | |
1869 | { | |
1870 | if (align==XXH_unaligned) { | |
1871 | return XXH_readLE32(ptr); | |
1872 | } else { | |
1873 | return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); | |
1874 | } | |
1875 | } | |
1876 | ||
1877 | ||
1878 | /* ************************************* | |
1879 | * Misc | |
1880 | ***************************************/ | |
1881 | /*! @ingroup public */ | |
1882 | XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } | |
1883 | ||
1884 | ||
1885 | /* ******************************************************************* | |
1886 | * 32-bit hash functions | |
1887 | *********************************************************************/ | |
1888 | /*! | |
1889 | * @} | |
1890 | * @defgroup xxh32_impl XXH32 implementation | |
1891 | * @ingroup impl | |
1892 | * @{ | |
1893 | */ | |
1894 | /* #define instead of static const, to be used as initializers */ | |
1895 | #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ | |
1896 | #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ | |
1897 | #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ | |
1898 | #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ | |
1899 | #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ | |
1900 | ||
1901 | #ifdef XXH_OLD_NAMES | |
1902 | # define PRIME32_1 XXH_PRIME32_1 | |
1903 | # define PRIME32_2 XXH_PRIME32_2 | |
1904 | # define PRIME32_3 XXH_PRIME32_3 | |
1905 | # define PRIME32_4 XXH_PRIME32_4 | |
1906 | # define PRIME32_5 XXH_PRIME32_5 | |
1907 | #endif | |
1908 | ||
1909 | /*! | |
1910 | * @internal | |
1911 | * @brief Normal stripe processing routine. | |
1912 | * | |
1913 | * This shuffles the bits so that any bit from @p input impacts several bits in | |
1914 | * @p acc. | |
1915 | * | |
1916 | * @param acc The accumulator lane. | |
1917 | * @param input The stripe of input to mix. | |
1918 | * @return The mixed accumulator lane. | |
1919 | */ | |
1920 | static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) | |
1921 | { | |
1922 | acc += input * XXH_PRIME32_2; | |
1923 | acc = XXH_rotl32(acc, 13); | |
1924 | acc *= XXH_PRIME32_1; | |
1925 | #if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) | |
1926 | /* | |
1927 | * UGLY HACK: | |
1928 | * A compiler fence is the only thing that prevents GCC and Clang from | |
1929 | * autovectorizing the XXH32 loop (pragmas and attributes don't work for some | |
1930 | * reason) without globally disabling SSE4.1. | |
1931 | * | |
1932 | * The reason we want to avoid vectorization is because despite working on | |
1933 | * 4 integers at a time, there are multiple factors slowing XXH32 down on | |
1934 | * SSE4: | |
1935 | * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on | |
1936 | * newer chips!) making it slightly slower to multiply four integers at | |
1937 | * once compared to four integers independently. Even on Sandy/Ivy Bridge, | |
1938 | * where pmulld was fastest, it is still not worth it to go into SSE | |
1939 | * just to multiply unless doing a long operation. | |
1940 | * | |
1941 | * - Four instructions are required to rotate, | |
1942 | * movdqa tmp, v // not required with VEX encoding | |
1943 | * pslld tmp, 13 // tmp <<= 13 | |
1944 | * psrld v, 19 // v >>= 19 | |
1945 | * por v, tmp // v |= tmp | |
1946 | * compared to one for scalar: | |
1947 | * roll v, 13 // reliably fast across the board | |
1948 | * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason | |
1949 | * | |
1950 | * - Instruction level parallelism is actually more beneficial here because | |
1951 | * the SIMD actually serializes this operation: While v1 is rotating, v2 | |
1952 | * can load data, while v3 can multiply. SSE forces them to operate | |
1953 | * together. | |
1954 | * | |
1955 | * This is also enabled on AArch64, as Clang autovectorizes it incorrectly | |
1956 | * and it is pointless writing a NEON implementation that is basically the | |
1957 | * same speed as scalar for XXH32. | |
1958 | */ | |
1959 | XXH_COMPILER_GUARD(acc); | |
1960 | #endif | |
1961 | return acc; | |
1962 | } | |
1963 | ||
1964 | /*! | |
1965 | * @internal | |
1966 | * @brief Mixes all bits to finalize the hash. | |
1967 | * | |
1968 | * The final mix ensures that all input bits have a chance to impact any bit in | |
1969 | * the output digest, resulting in an unbiased distribution. | |
1970 | * | |
1971 | * @param h32 The hash to avalanche. | |
1972 | * @return The avalanched hash. | |
1973 | */ | |
1974 | static xxh_u32 XXH32_avalanche(xxh_u32 h32) | |
1975 | { | |
1976 | h32 ^= h32 >> 15; | |
1977 | h32 *= XXH_PRIME32_2; | |
1978 | h32 ^= h32 >> 13; | |
1979 | h32 *= XXH_PRIME32_3; | |
1980 | h32 ^= h32 >> 16; | |
1981 | return(h32); | |
1982 | } | |
1983 | ||
1984 | #define XXH_get32bits(p) XXH_readLE32_align(p, align) | |
1985 | ||
1986 | /*! | |
1987 | * @internal | |
1988 | * @brief Processes the last 0-15 bytes of @p ptr. | |
1989 | * | |
1990 | * There may be up to 15 bytes remaining to consume from the input. | |
1991 | * This final stage will digest them to ensure that all input bytes are present | |
1992 | * in the final mix. | |
1993 | * | |
1994 | * @param h32 The hash to finalize. | |
1995 | * @param ptr The pointer to the remaining input. | |
1996 | * @param len The remaining length, modulo 16. | |
1997 | * @param align Whether @p ptr is aligned. | |
1998 | * @return The finalized hash. | |
1999 | */ | |
2000 | static xxh_u32 | |
2001 | XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) | |
2002 | { | |
2003 | #define XXH_PROCESS1 do { \ | |
2004 | h32 += (*ptr++) * XXH_PRIME32_5; \ | |
2005 | h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ | |
2006 | } while (0) | |
2007 | ||
2008 | #define XXH_PROCESS4 do { \ | |
2009 | h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ | |
2010 | ptr += 4; \ | |
2011 | h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ | |
2012 | } while (0) | |
2013 | ||
2014 | if (ptr==NULL) XXH_ASSERT(len == 0); | |
2015 | ||
2016 | /* Compact rerolled version; generally faster */ | |
2017 | if (!XXH32_ENDJMP) { | |
2018 | len &= 15; | |
2019 | while (len >= 4) { | |
2020 | XXH_PROCESS4; | |
2021 | len -= 4; | |
2022 | } | |
2023 | while (len > 0) { | |
2024 | XXH_PROCESS1; | |
2025 | --len; | |
2026 | } | |
2027 | return XXH32_avalanche(h32); | |
2028 | } else { | |
2029 | switch(len&15) /* or switch(bEnd - p) */ { | |
2030 | case 12: XXH_PROCESS4; | |
2031 | XXH_FALLTHROUGH; | |
2032 | case 8: XXH_PROCESS4; | |
2033 | XXH_FALLTHROUGH; | |
2034 | case 4: XXH_PROCESS4; | |
2035 | return XXH32_avalanche(h32); | |
2036 | ||
2037 | case 13: XXH_PROCESS4; | |
2038 | XXH_FALLTHROUGH; | |
2039 | case 9: XXH_PROCESS4; | |
2040 | XXH_FALLTHROUGH; | |
2041 | case 5: XXH_PROCESS4; | |
2042 | XXH_PROCESS1; | |
2043 | return XXH32_avalanche(h32); | |
2044 | ||
2045 | case 14: XXH_PROCESS4; | |
2046 | XXH_FALLTHROUGH; | |
2047 | case 10: XXH_PROCESS4; | |
2048 | XXH_FALLTHROUGH; | |
2049 | case 6: XXH_PROCESS4; | |
2050 | XXH_PROCESS1; | |
2051 | XXH_PROCESS1; | |
2052 | return XXH32_avalanche(h32); | |
2053 | ||
2054 | case 15: XXH_PROCESS4; | |
2055 | XXH_FALLTHROUGH; | |
2056 | case 11: XXH_PROCESS4; | |
2057 | XXH_FALLTHROUGH; | |
2058 | case 7: XXH_PROCESS4; | |
2059 | XXH_FALLTHROUGH; | |
2060 | case 3: XXH_PROCESS1; | |
2061 | XXH_FALLTHROUGH; | |
2062 | case 2: XXH_PROCESS1; | |
2063 | XXH_FALLTHROUGH; | |
2064 | case 1: XXH_PROCESS1; | |
2065 | XXH_FALLTHROUGH; | |
2066 | case 0: return XXH32_avalanche(h32); | |
2067 | } | |
2068 | XXH_ASSERT(0); | |
2069 | return h32; /* reaching this point is deemed impossible */ | |
2070 | } | |
2071 | } | |
2072 | ||
2073 | #ifdef XXH_OLD_NAMES | |
2074 | # define PROCESS1 XXH_PROCESS1 | |
2075 | # define PROCESS4 XXH_PROCESS4 | |
2076 | #else | |
2077 | # undef XXH_PROCESS1 | |
2078 | # undef XXH_PROCESS4 | |
2079 | #endif | |
2080 | ||
2081 | /*! | |
2082 | * @internal | |
2083 | * @brief The implementation for @ref XXH32(). | |
2084 | * | |
2085 | * @param input , len , seed Directly passed from @ref XXH32(). | |
2086 | * @param align Whether @p input is aligned. | |
2087 | * @return The calculated hash. | |
2088 | */ | |
2089 | XXH_FORCE_INLINE xxh_u32 | |
2090 | XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) | |
2091 | { | |
2092 | xxh_u32 h32; | |
2093 | ||
2094 | if (input==NULL) XXH_ASSERT(len == 0); | |
2095 | ||
2096 | if (len>=16) { | |
2097 | const xxh_u8* const bEnd = input + len; | |
2098 | const xxh_u8* const limit = bEnd - 15; | |
2099 | xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; | |
2100 | xxh_u32 v2 = seed + XXH_PRIME32_2; | |
2101 | xxh_u32 v3 = seed + 0; | |
2102 | xxh_u32 v4 = seed - XXH_PRIME32_1; | |
2103 | ||
2104 | do { | |
2105 | v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; | |
2106 | v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; | |
2107 | v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; | |
2108 | v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; | |
2109 | } while (input < limit); | |
2110 | ||
2111 | h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) | |
2112 | + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); | |
2113 | } else { | |
2114 | h32 = seed + XXH_PRIME32_5; | |
2115 | } | |
2116 | ||
2117 | h32 += (xxh_u32)len; | |
2118 | ||
2119 | return XXH32_finalize(h32, input, len&15, align); | |
2120 | } | |
2121 | ||
2122 | /*! @ingroup xxh32_family */ | |
2123 | XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) | |
2124 | { | |
2125 | #if 0 | |
2126 | /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ | |
2127 | XXH32_state_t state; | |
2128 | XXH32_reset(&state, seed); | |
2129 | XXH32_update(&state, (const xxh_u8*)input, len); | |
2130 | return XXH32_digest(&state); | |
2131 | #else | |
2132 | if (XXH_FORCE_ALIGN_CHECK) { | |
2133 | if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ | |
2134 | return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); | |
2135 | } } | |
2136 | ||
2137 | return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); | |
2138 | #endif | |
2139 | } | |
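/*
 * One-shot usage sketch (illustrative):
 *
 * @code
 * const char msg[] = "Nobody inspects the spammish repetition";
 * XXH32_hash_t const h = XXH32(msg, sizeof(msg)-1, 0);   // seed = 0
 * @endcode
 */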
2140 | ||
2141 | ||
2142 | ||
2143 | /******* Hash streaming *******/ | |
2144 | /*! | |
2145 | * @ingroup xxh32_family | |
2146 | */ | |
2147 | XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) | |
2148 | { | |
2149 | return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); | |
2150 | } | |
2151 | /*! @ingroup xxh32_family */ | |
2152 | XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) | |
2153 | { | |
2154 | XXH_free(statePtr); | |
2155 | return XXH_OK; | |
2156 | } | |
2157 | ||
2158 | /*! @ingroup xxh32_family */ | |
2159 | XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) | |
2160 | { | |
2161 | XXH_memcpy(dstState, srcState, sizeof(*dstState)); | |
2162 | } | |
2163 | ||
2164 | /*! @ingroup xxh32_family */ | |
2165 | XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) | |
2166 | { | |
2167 | XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ | |
2168 | memset(&state, 0, sizeof(state)); | |
2169 | state.v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; | |
2170 | state.v[1] = seed + XXH_PRIME32_2; | |
2171 | state.v[2] = seed + 0; | |
2172 | state.v[3] = seed - XXH_PRIME32_1; | |
2173 | /* do not write into reserved, planned to be removed in a future version */ | |
2174 | XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); | |
2175 | return XXH_OK; | |
2176 | } | |
2177 | ||
2178 | ||
2179 | /*! @ingroup xxh32_family */ | |
2180 | XXH_PUBLIC_API XXH_errorcode | |
2181 | XXH32_update(XXH32_state_t* state, const void* input, size_t len) | |
2182 | { | |
2183 | if (input==NULL) { | |
2184 | XXH_ASSERT(len == 0); | |
2185 | return XXH_OK; | |
2186 | } | |
2187 | ||
2188 | { const xxh_u8* p = (const xxh_u8*)input; | |
2189 | const xxh_u8* const bEnd = p + len; | |
2190 | ||
2191 | state->total_len_32 += (XXH32_hash_t)len; | |
2192 | state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); | |
2193 | ||
2194 | if (state->memsize + len < 16) { /* fill in tmp buffer */ | |
2195 | XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); | |
2196 | state->memsize += (XXH32_hash_t)len; | |
2197 | return XXH_OK; | |
2198 | } | |
2199 | ||
2200 | if (state->memsize) { /* some data left from previous update */ | |
2201 | XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); | |
2202 | { const xxh_u32* p32 = state->mem32; | |
2203 | state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; | |
2204 | state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; | |
2205 | state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; | |
2206 | state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); | |
2207 | } | |
2208 | p += 16-state->memsize; | |
2209 | state->memsize = 0; | |
2210 | } | |
2211 | ||
2212 | if (p <= bEnd-16) { | |
2213 | const xxh_u8* const limit = bEnd - 16; | |
2214 | ||
2215 | do { | |
2216 | state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; | |
2217 | state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; | |
2218 | state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; | |
2219 | state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; | |
2220 | } while (p<=limit); | |
2221 | ||
2222 | } | |
2223 | ||
2224 | if (p < bEnd) { | |
2225 | XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); | |
2226 | state->memsize = (unsigned)(bEnd-p); | |
2227 | } | |
2228 | } | |
2229 | ||
2230 | return XXH_OK; | |
2231 | } | |
2232 | ||
2233 | ||
2234 | /*! @ingroup xxh32_family */ | |
2235 | XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) | |
2236 | { | |
2237 | xxh_u32 h32; | |
2238 | ||
2239 | if (state->large_len) { | |
2240 | h32 = XXH_rotl32(state->v[0], 1) | |
2241 | + XXH_rotl32(state->v[1], 7) | |
2242 | + XXH_rotl32(state->v[2], 12) | |
2243 | + XXH_rotl32(state->v[3], 18); | |
2244 | } else { | |
2245 | h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; | |
2246 | } | |
2247 | ||
2248 | h32 += state->total_len_32; | |
2249 | ||
2250 | return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); | |
2251 | } | |
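/*
 * Streaming usage sketch (illustrative; chunk1/chunk2 are assumed buffers
 * and error checks are abbreviated):
 *
 * @code
 * XXH32_state_t* const st = XXH32_createState();
 * if (st != NULL) {
 *     XXH32_reset(st, 0);
 *     XXH32_update(st, chunk1, chunk1Size);   // feed data in any pieces
 *     XXH32_update(st, chunk2, chunk2Size);
 *     {   XXH32_hash_t const h = XXH32_digest(st);  // equals one-shot XXH32()
 *         (void)h;
 *     }
 *     XXH32_freeState(st);
 * }
 * @endcode
 */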
2252 | ||
2253 | ||
2254 | /******* Canonical representation *******/ | |
2255 | ||
2256 | /*! | |
2257 | * @ingroup xxh32_family | |
2258 | * The default return values from XXH functions are unsigned 32 and 64 bit | |
2259 | * integers. | |
2260 | * | |
2261 | * The canonical representation uses big endian convention, the same convention | |
2262 | * as human-readable numbers (large digits first). | |
2263 | * | |
2264 | * This way, hash values can be written into a file or buffer, remaining | |
2265 | * comparable across different systems. | |
2266 | * | |
2267 | * The following functions allow transformation of hash values to and from their | |
2268 | * canonical format. | |
2269 | */ | |
2270 | XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) | |
2271 | { | |
2272 | XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); | |
2273 | if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); | |
2274 | XXH_memcpy(dst, &hash, sizeof(*dst)); | |
2275 | } | |
2276 | /*! @ingroup xxh32_family */ | |
2277 | XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) | |
2278 | { | |
2279 | return XXH_readBE32(src); | |
2280 | } | |
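/*
 * Round-trip sketch: the canonical form makes stored hashes portable across
 * endianness (illustrative; `f` is an assumed FILE*):
 *
 * @code
 * XXH32_canonical_t c;
 * XXH32_canonicalFromHash(&c, h);             // big-endian bytes, safe to store
 * fwrite(c.digest, 1, sizeof(c.digest), f);
 * // ... later, on any machine:
 * XXH32_hash_t const h2 = XXH32_hashFromCanonical(&c);   // h2 == h
 * @endcode
 */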
2281 | ||
2282 | ||
2283 | #ifndef XXH_NO_LONG_LONG | |
2284 | ||
2285 | /* ******************************************************************* | |
2286 | * 64-bit hash functions | |
2287 | *********************************************************************/ | |
2288 | /*! | |
2289 | * @} | |
2290 | * @ingroup impl | |
2291 | * @{ | |
2292 | */ | |
2293 | /******* Memory access *******/ | |
2294 | ||
2295 | typedef XXH64_hash_t xxh_u64; | |
2296 | ||
2297 | #ifdef XXH_OLD_NAMES | |
2298 | # define U64 xxh_u64 | |
2299 | #endif | |
2300 | ||
2301 | #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) | |
2302 | /* | |
2303 | * Manual byteshift. Best for old compilers which don't inline memcpy. | |
2304 | * We actually directly use XXH_readLE64 and XXH_readBE64. | |
2305 | */ | |
2306 | #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) | |
2307 | ||
2308 | /* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */ | |
2309 | static xxh_u64 XXH_read64(const void* memPtr) | |
2310 | { | |
2311 | return *(const xxh_u64*) memPtr; | |
2312 | } | |
2313 | ||
2314 | #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) | |
2315 | ||
2316 | /* | |
2317 | * __pack instructions are safer, but compiler specific, hence potentially | |
2318 | * problematic for some compilers. | |
2319 | * | |
2320 | * Currently only defined for GCC and ICC. | |
2321 | */ | |
2322 | #ifdef XXH_OLD_NAMES | |
2323 | typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; | |
2324 | #endif | |
2325 | static xxh_u64 XXH_read64(const void* ptr) | |
2326 | { | |
2327 | typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; | |
2328 | return ((const xxh_unalign64*)ptr)->u64; | |
2329 | } | |
2330 | ||
2331 | #else | |
2332 | ||
2333 | /* | |
2334 | * Portable and safe solution. Generally efficient. | |
2335 | * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html | |
2336 | */ | |
2337 | static xxh_u64 XXH_read64(const void* memPtr) | |
2338 | { | |
2339 | xxh_u64 val; | |
2340 | XXH_memcpy(&val, memPtr, sizeof(val)); | |
2341 | return val; | |
2342 | } | |
2343 | ||
2344 | #endif /* XXH_FORCE_MEMORY_ACCESS */ | |
2345 | ||
2346 | #if defined(_MSC_VER) /* Visual Studio */ | |
2347 | # define XXH_swap64 _byteswap_uint64 | |
2348 | #elif XXH_GCC_VERSION >= 403 | |
2349 | # define XXH_swap64 __builtin_bswap64 | |
2350 | #else | |
2351 | static xxh_u64 XXH_swap64(xxh_u64 x) | |
2352 | { | |
2353 | return ((x << 56) & 0xff00000000000000ULL) | | |
2354 | ((x << 40) & 0x00ff000000000000ULL) | | |
2355 | ((x << 24) & 0x0000ff0000000000ULL) | | |
2356 | ((x << 8) & 0x000000ff00000000ULL) | | |
2357 | ((x >> 8) & 0x00000000ff000000ULL) | | |
2358 | ((x >> 24) & 0x0000000000ff0000ULL) | | |
2359 | ((x >> 40) & 0x000000000000ff00ULL) | | |
2360 | ((x >> 56) & 0x00000000000000ffULL); | |
2361 | } | |
2362 | #endif | |
2363 | ||
2364 | ||
2365 | /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ | |
2366 | #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) | |
2367 | ||
2368 | XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) | |
2369 | { | |
2370 | const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; | |
2371 | return bytePtr[0] | |
2372 | | ((xxh_u64)bytePtr[1] << 8) | |
2373 | | ((xxh_u64)bytePtr[2] << 16) | |
2374 | | ((xxh_u64)bytePtr[3] << 24) | |
2375 | | ((xxh_u64)bytePtr[4] << 32) | |
2376 | | ((xxh_u64)bytePtr[5] << 40) | |
2377 | | ((xxh_u64)bytePtr[6] << 48) | |
2378 | | ((xxh_u64)bytePtr[7] << 56); | |
2379 | } | |
2380 | ||
2381 | XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) | |
2382 | { | |
2383 | const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; | |
2384 | return bytePtr[7] | |
2385 | | ((xxh_u64)bytePtr[6] << 8) | |
2386 | | ((xxh_u64)bytePtr[5] << 16) | |
2387 | | ((xxh_u64)bytePtr[4] << 24) | |
2388 | | ((xxh_u64)bytePtr[3] << 32) | |
2389 | | ((xxh_u64)bytePtr[2] << 40) | |
2390 | | ((xxh_u64)bytePtr[1] << 48) | |
2391 | | ((xxh_u64)bytePtr[0] << 56); | |
2392 | } | |
2393 | ||
2394 | #else | |
2395 | XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) | |
2396 | { | |
2397 | return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); | |
2398 | } | |
2399 | ||
2400 | static xxh_u64 XXH_readBE64(const void* ptr) | |
2401 | { | |
2402 | return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); | |
2403 | } | |
2404 | #endif | |
2405 | ||
2406 | XXH_FORCE_INLINE xxh_u64 | |
2407 | XXH_readLE64_align(const void* ptr, XXH_alignment align) | |
2408 | { | |
2409 | if (align==XXH_unaligned) | |
2410 | return XXH_readLE64(ptr); | |
2411 | else | |
2412 | return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); | |
2413 | } | |
2414 | ||
2415 | ||
2416 | /******* xxh64 *******/ | |
2417 | /*! | |
2418 | * @} | |
2419 | * @defgroup xxh64_impl XXH64 implementation | |
2420 | * @ingroup impl | |
2421 | * @{ | |
2422 | */ | |
2423 | /* #define rather than static const, to be used as initializers */ | |
2424 | #define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ | |
2425 | #define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ | |
2426 | #define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ | |
2427 | #define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ | |
2428 | #define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ | |
2429 | ||
2430 | #ifdef XXH_OLD_NAMES | |
2431 | # define PRIME64_1 XXH_PRIME64_1 | |
2432 | # define PRIME64_2 XXH_PRIME64_2 | |
2433 | # define PRIME64_3 XXH_PRIME64_3 | |
2434 | # define PRIME64_4 XXH_PRIME64_4 | |
2435 | # define PRIME64_5 XXH_PRIME64_5 | |
2436 | #endif | |
2437 | ||
2438 | static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) | |
2439 | { | |
2440 | acc += input * XXH_PRIME64_2; | |
2441 | acc = XXH_rotl64(acc, 31); | |
2442 | acc *= XXH_PRIME64_1; | |
2443 | return acc; | |
2444 | } | |
2445 | ||
2446 | static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) | |
2447 | { | |
2448 | val = XXH64_round(0, val); | |
2449 | acc ^= val; | |
2450 | acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; | |
2451 | return acc; | |
2452 | } | |
2453 | ||
2454 | static xxh_u64 XXH64_avalanche(xxh_u64 h64) | |
2455 | { | |
2456 | h64 ^= h64 >> 33; | |
2457 | h64 *= XXH_PRIME64_2; | |
2458 | h64 ^= h64 >> 29; | |
2459 | h64 *= XXH_PRIME64_3; | |
2460 | h64 ^= h64 >> 32; | |
2461 | return h64; | |
2462 | } | |
2463 | ||
2464 | ||
2465 | #define XXH_get64bits(p) XXH_readLE64_align(p, align) | |
2466 | ||
2467 | static xxh_u64 | |
2468 | XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) | |
2469 | { | |
2470 | if (ptr==NULL) XXH_ASSERT(len == 0); | |
2471 | len &= 31; | |
2472 | while (len >= 8) { | |
2473 | xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); | |
2474 | ptr += 8; | |
2475 | h64 ^= k1; | |
2476 | h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; | |
2477 | len -= 8; | |
2478 | } | |
2479 | if (len >= 4) { | |
2480 | h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; | |
2481 | ptr += 4; | |
2482 | h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; | |
2483 | len -= 4; | |
2484 | } | |
2485 | while (len > 0) { | |
2486 | h64 ^= (*ptr++) * XXH_PRIME64_5; | |
2487 | h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; | |
2488 | --len; | |
2489 | } | |
2490 | return XXH64_avalanche(h64); | |
2491 | } | |
2492 | ||
2493 | #ifdef XXH_OLD_NAMES | |
2494 | # define PROCESS1_64 XXH_PROCESS1_64 | |
2495 | # define PROCESS4_64 XXH_PROCESS4_64 | |
2496 | # define PROCESS8_64 XXH_PROCESS8_64 | |
2497 | #else | |
2498 | # undef XXH_PROCESS1_64 | |
2499 | # undef XXH_PROCESS4_64 | |
2500 | # undef XXH_PROCESS8_64 | |
2501 | #endif | |
2502 | ||
2503 | XXH_FORCE_INLINE xxh_u64 | |
2504 | XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) | |
2505 | { | |
2506 | xxh_u64 h64; | |
2507 | if (input==NULL) XXH_ASSERT(len == 0); | |
2508 | ||
2509 | if (len>=32) { | |
2510 | const xxh_u8* const bEnd = input + len; | |
2511 | const xxh_u8* const limit = bEnd - 31; | |
2512 | xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; | |
2513 | xxh_u64 v2 = seed + XXH_PRIME64_2; | |
2514 | xxh_u64 v3 = seed + 0; | |
2515 | xxh_u64 v4 = seed - XXH_PRIME64_1; | |
2516 | ||
2517 | do { | |
2518 | v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; | |
2519 | v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; | |
2520 | v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; | |
2521 | v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; | |
2522 | } while (input<limit); | |
2523 | ||
2524 | h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); | |
2525 | h64 = XXH64_mergeRound(h64, v1); | |
2526 | h64 = XXH64_mergeRound(h64, v2); | |
2527 | h64 = XXH64_mergeRound(h64, v3); | |
2528 | h64 = XXH64_mergeRound(h64, v4); | |
2529 | ||
2530 | } else { | |
2531 | h64 = seed + XXH_PRIME64_5; | |
2532 | } | |
2533 | ||
2534 | h64 += (xxh_u64) len; | |
2535 | ||
2536 | return XXH64_finalize(h64, input, len, align); | |
2537 | } | |
2538 | ||
2539 | ||
2540 | /*! @ingroup xxh64_family */ | |
2541 | XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) | |
2542 | { | |
2543 | #if 0 | |
2544 | /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ | |
2545 | XXH64_state_t state; | |
2546 | XXH64_reset(&state, seed); | |
2547 | XXH64_update(&state, (const xxh_u8*)input, len); | |
2548 | return XXH64_digest(&state); | |
2549 | #else | |
2550 | if (XXH_FORCE_ALIGN_CHECK) { | |
2551 | if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ | |
2552 | return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); | |
2553 | } } | |
2554 | ||
2555 | return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); | |
2556 | ||
2557 | #endif | |
2558 | } | |
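/*
 * One-shot usage sketch (illustrative; buf/bufSize are assumed):
 *
 * @code
 * XXH64_hash_t const h = XXH64(buf, bufSize, 0);   // seed = 0
 * @endcode
 */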
2559 | ||
2560 | /******* Hash Streaming *******/ | |
2561 | ||
2562 | /*! @ingroup xxh64_family*/ | |
2563 | XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) | |
2564 | { | |
2565 | return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); | |
2566 | } | |
2567 | /*! @ingroup xxh64_family */ | |
2568 | XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) | |
2569 | { | |
2570 | XXH_free(statePtr); | |
2571 | return XXH_OK; | |
2572 | } | |
2573 | ||
2574 | /*! @ingroup xxh64_family */ | |
2575 | XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) | |
2576 | { | |
2577 | XXH_memcpy(dstState, srcState, sizeof(*dstState)); | |
2578 | } | |
2579 | ||
2580 | /*! @ingroup xxh64_family */ | |
2581 | XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) | |
2582 | { | |
2583 | XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ | |
2584 | memset(&state, 0, sizeof(state)); | |
2585 | state.v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; | |
2586 | state.v[1] = seed + XXH_PRIME64_2; | |
2587 | state.v[2] = seed + 0; | |
2588 | state.v[3] = seed - XXH_PRIME64_1; | |
2589 | /* do not write into reserved64, might be removed in a future version */ | |
2590 | XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); | |
2591 | return XXH_OK; | |
2592 | } | |
2593 | ||
2594 | /*! @ingroup xxh64_family */ | |
2595 | XXH_PUBLIC_API XXH_errorcode | |
2596 | XXH64_update (XXH64_state_t* state, const void* input, size_t len) | |
2597 | { | |
2598 | if (input==NULL) { | |
2599 | XXH_ASSERT(len == 0); | |
2600 | return XXH_OK; | |
2601 | } | |
2602 | ||
2603 | { const xxh_u8* p = (const xxh_u8*)input; | |
2604 | const xxh_u8* const bEnd = p + len; | |
2605 | ||
2606 | state->total_len += len; | |
2607 | ||
2608 | if (state->memsize + len < 32) { /* fill in tmp buffer */ | |
2609 | XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); | |
2610 | state->memsize += (xxh_u32)len; | |
2611 | return XXH_OK; | |
2612 | } | |
2613 | ||
2614 | if (state->memsize) { /* tmp buffer is full */ | |
2615 | XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); | |
2616 | state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); | |
2617 | state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); | |
2618 | state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); | |
2619 | state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); | |
2620 | p += 32 - state->memsize; | |
2621 | state->memsize = 0; | |
2622 | } | |
2623 | ||
2624 | if (p+32 <= bEnd) { | |
2625 | const xxh_u8* const limit = bEnd - 32; | |
2626 | ||
2627 | do { | |
2628 | state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; | |
2629 | state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; | |
2630 | state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; | |
2631 | state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; | |
2632 | } while (p<=limit); | |
2633 | ||
2634 | } | |
2635 | ||
2636 | if (p < bEnd) { | |
2637 | XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); | |
2638 | state->memsize = (unsigned)(bEnd-p); | |
2639 | } | |
2640 | } | |
2641 | ||
2642 | return XXH_OK; | |
2643 | } | |
2644 | ||
2645 | ||
2646 | /*! @ingroup xxh64_family */ | |
2647 | XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) | |
2648 | { | |
2649 | xxh_u64 h64; | |
2650 | ||
2651 | if (state->total_len >= 32) { | |
2652 | h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); | |
2653 | h64 = XXH64_mergeRound(h64, state->v[0]); | |
2654 | h64 = XXH64_mergeRound(h64, state->v[1]); | |
2655 | h64 = XXH64_mergeRound(h64, state->v[2]); | |
2656 | h64 = XXH64_mergeRound(h64, state->v[3]); | |
2657 | } else { | |
2658 | h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; | |
2659 | } | |
2660 | ||
2661 | h64 += (xxh_u64) state->total_len; | |
2662 | ||
2663 | return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); | |
2664 | } | |
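/*
 * Illustrative sketch (not part of the original header): typical use of the
 * streaming API defined above. The chunk size of 7 and the helper name are
 * arbitrary choices for the example; the final digest matches a one-shot
 * XXH64() over the same bytes.
 */
#if 0 /* example only */
static XXH64_hash_t example_hash_in_chunks(const void* data, size_t size, XXH64_hash_t seed)
{
    XXH64_hash_t hash = 0;
    XXH64_state_t* const state = XXH64_createState();  /* heap-allocated state */
    if (state == NULL) return 0;
    if (XXH64_reset(state, seed) == XXH_OK) {
        const unsigned char* p = (const unsigned char*)data;
        size_t remaining = size;
        while (remaining > 0) {   /* feed input in arbitrary-sized chunks */
            size_t const chunk = (remaining < 7) ? remaining : 7;
            XXH64_update(state, p, chunk);
            p += chunk;
            remaining -= chunk;
        }
        hash = XXH64_digest(state);  /* non-destructive; can be called at any point */
    }
    XXH64_freeState(state);
    return hash;
}
#endif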
2665 | ||
2666 | ||
2667 | /******* Canonical representation *******/ | |
2668 | ||
2669 | /*! @ingroup xxh64_family */ | |
2670 | XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) | |
2671 | { | |
2672 | XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); | |
2673 | if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); | |
2674 | XXH_memcpy(dst, &hash, sizeof(*dst)); | |
2675 | } | |
2676 | ||
2677 | /*! @ingroup xxh64_family */ | |
2678 | XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) | |
2679 | { | |
2680 | return XXH_readBE64(src); | |
2681 | } | |
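/*
 * Illustrative sketch (not part of the original header): round-tripping a
 * hash through its canonical (big-endian) byte representation, e.g. before
 * writing it to disk or sending it over a network.
 */
#if 0 /* example only */
static void example_canonical_roundtrip(XXH64_hash_t hash)
{
    XXH64_canonical_t canonical;
    XXH64_canonicalFromHash(&canonical, hash);    /* serialize, big-endian */
    /* canonical.digest[] now holds a portable 8-byte representation */
    {   XXH64_hash_t const restored = XXH64_hashFromCanonical(&canonical);
        XXH_ASSERT(restored == hash);             /* exact round-trip */
    }
}
#endif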
2682 | ||
2683 | #ifndef XXH_NO_XXH3 | |
2684 | ||
2685 | /* ********************************************************************* | |
2686 | * XXH3 | |
2687 | * New generation hash designed for speed on small keys and vectorization | |
2688 | ************************************************************************ */ | |
2689 | /*! | |
2690 | * @} | |
2691 | * @defgroup xxh3_impl XXH3 implementation | |
2692 | * @ingroup impl | |
2693 | * @{ | |
2694 | */ | |
2695 | ||
2696 | /* === Compiler specifics === */ | |
2697 | ||
2698 | #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ | |
2699 | # define XXH_RESTRICT /* disable */ | |
2700 | #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ | |
2701 | # define XXH_RESTRICT restrict | |
2702 | #else | |
2703 | /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ | |
2704 | # define XXH_RESTRICT /* disable */ | |
2705 | #endif | |
2706 | ||
2707 | #if (defined(__GNUC__) && (__GNUC__ >= 3)) \ | |
2708 | || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ | |
2709 | || defined(__clang__) | |
2710 | # define XXH_likely(x) __builtin_expect(x, 1) | |
2711 | # define XXH_unlikely(x) __builtin_expect(x, 0) | |
2712 | #else | |
2713 | # define XXH_likely(x) (x) | |
2714 | # define XXH_unlikely(x) (x) | |
2715 | #endif | |
2716 | ||
2717 | #if defined(__GNUC__) | |
2718 | # if defined(__AVX2__) | |
2719 | # include <immintrin.h> | |
2720 | # elif defined(__SSE2__) | |
2721 | # include <emmintrin.h> | |
2722 | # elif defined(__ARM_NEON__) || defined(__ARM_NEON) | |
2723 | # define inline __inline__ /* circumvent a clang bug */ | |
2724 | # include <arm_neon.h> | |
2725 | # undef inline | |
2726 | # endif | |
2727 | #elif defined(_MSC_VER) | |
2728 | # include <intrin.h> | |
2729 | #endif | |
2730 | ||
2731 | /* | |
2732 | * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while | |
2733 | * remaining a true 64-bit/128-bit hash function. | |
2734 | * | |
2735 | * This is done by prioritizing a subset of 64-bit operations that can be | |
2736 | * emulated without too many steps on the average 32-bit machine. | |
2737 | * | |
2738 | * For example, these two lines seem similar, and run equally fast on 64-bit: | |
2739 | * | |
2740 | * xxh_u64 x; | |
2741 | * x ^= (x >> 47); // good | |
2742 | * x ^= (x >> 13); // bad | |
2743 | * | |
2744 | * However, to a 32-bit machine, there is a major difference. | |
2745 | * | |
2746 | * x ^= (x >> 47) looks like this: | |
2747 | * | |
2748 | * x.lo ^= (x.hi >> (47 - 32)); | |
2749 | * | |
2750 | * while x ^= (x >> 13) looks like this: | |
2751 | * | |
2752 | * // note: funnel shifts are not usually cheap. | |
2753 | * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); | |
2754 | * x.hi ^= (x.hi >> 13); | |
2755 | * | |
2756 | * The first one is significantly faster than the second, simply because the | |
2757 | * shift is larger than 32. This means: | |
2758 | * - All the bits we need are in the upper 32 bits, so we can ignore the lower | |
2759 | * 32 bits in the shift. | |
2760 | * - The shift result will always fit in the lower 32 bits, and therefore, | |
2761 | * we can ignore the upper 32 bits in the xor. | |
2762 | * | |
2763 | * Thanks to this optimization, XXH3 only requires these features to be efficient: | |
2764 | * | |
2765 | * - Usable unaligned access | |
2766 | * - A 32-bit or 64-bit ALU | |
2767 | * - If 32-bit, a decent ADC instruction | |
2768 | * - A 32 or 64-bit multiply with a 64-bit result | |
2769 | * - For the 128-bit variant, a decent byteswap helps short inputs. | |
2770 | * | |
2771 | * The first two are already required by XXH32, and almost all 32-bit and 64-bit | |
2772 | * platforms which can run XXH32 can run XXH3 efficiently. | |
2773 | * | |
2774 | * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one | |
2775 | * notable exception. | |
2776 | * | |
2777 | * First of all, Thumb-1 lacks support for the UMULL instruction which | |
2778 | * performs the important long multiply. This means numerous __aeabi_lmul | |
2779 | * calls. | |
2780 | * | |
2781 | * Second of all, the 8 functional registers are just not enough. | |
2782 | * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need | |
2783 | * Lo registers, and this shuffling results in thousands more MOVs than A32. | |
2784 | * | |
2785 | * A32 and T32 don't have this limitation. They can access all 14 registers, | |
2786 | * do a 32->64 multiply with UMULL, and the flexible second operand, which | |
2787 | * allows free shifts, helps too. | |
2788 | * | |
2789 | * Therefore, we do a quick sanity check. | |
2790 | * | |
2791 | * If compiling Thumb-1 for a target which supports ARM instructions, we will | |
2792 | * emit a warning, as it is not a "sane" platform to compile for. | |
2793 | * | |
2794 | * Usually, if this happens, it is because of an accident and you probably need | |
2795 | * to specify -march, as you likely meant to compile for a newer architecture. | |
2796 | * | |
2797 | * Credit: large sections of the vectorial and asm source code paths | |
2798 | * have been contributed by @easyaspi314 | |
2799 | */ | |
2800 | #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) | |
2801 | # warning "XXH3 is highly inefficient without ARM or Thumb-2." | |
2802 | #endif | |
2803 | ||
2804 | /* ========================================== | |
2805 | * Vectorization detection | |
2806 | * ========================================== */ | |
2807 | ||
2808 | #ifdef XXH_DOXYGEN | |
2809 | /*! | |
2810 | * @ingroup tuning | |
2811 | * @brief Overrides the vectorization implementation chosen for XXH3. | |
2812 | * | |
2813 | * Can be defined to 0 to disable SIMD or any of the values mentioned in | |
2814 | * @ref XXH_VECTOR_TYPE. | |
2815 | * | |
2816 | * If this is not defined, it uses predefined macros to determine the best | |
2817 | * implementation. | |
2818 | */ | |
2819 | # define XXH_VECTOR XXH_SCALAR | |
2820 | /*! | |
2821 | * @ingroup tuning | |
2822 | * @brief Possible values for @ref XXH_VECTOR. | |
2823 | * | |
2824 | * Note that these are actually implemented as macros. | |
2825 | * | |
2826 | * If this is not defined, it is detected automatically. | |
2827 | * @ref XXH_X86DISPATCH overrides this. | |
2828 | */ | |
2829 | enum XXH_VECTOR_TYPE /* fake enum */ { | |
2830 | XXH_SCALAR = 0, /*!< Portable scalar version */ | |
2831 | XXH_SSE2 = 1, /*!< | |
2832 | * SSE2 for Pentium 4, Opteron, all x86_64. | |
2833 | * | |
2834 | * @note SSE2 is also guaranteed on Windows 10, macOS, and | |
2835 | * Android x86. | |
2836 | */ | |
2837 | XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ | |
2838 | XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ | |
2839 | XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ | |
2840 | XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ | |
2841 | }; | |
2842 | /*! | |
2843 | * @ingroup tuning | |
2844 | * @brief Selects the minimum alignment for XXH3's accumulators. | |
2845 | * | |
2846 | * When using SIMD, this should match the alignment required for said vector | |
2847 | * type, so, for example, 32 for AVX2. | |
2848 | * | |
2849 | * Default: Auto detected. | |
2850 | */ | |
2851 | # define XXH_ACC_ALIGN 8 | |
2852 | #endif | |
2853 | ||
2854 | /* Actual definition */ | |
2855 | #ifndef XXH_DOXYGEN | |
2856 | # define XXH_SCALAR 0 | |
2857 | # define XXH_SSE2 1 | |
2858 | # define XXH_AVX2 2 | |
2859 | # define XXH_AVX512 3 | |
2860 | # define XXH_NEON 4 | |
2861 | # define XXH_VSX 5 | |
2862 | #endif | |
2863 | ||
2864 | #ifndef XXH_VECTOR /* can be defined on command line */ | |
2865 | # if defined(__AVX512F__) | |
2866 | # define XXH_VECTOR XXH_AVX512 | |
2867 | # elif defined(__AVX2__) | |
2868 | # define XXH_VECTOR XXH_AVX2 | |
2869 | # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) | |
2870 | # define XXH_VECTOR XXH_SSE2 | |
2871 | # elif ( \ | |
2872 | defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ | |
2873 | || defined(_M_ARM64) || defined(_M_ARM_ARMV7VE) /* msvc */ \ | |
2874 | ) && ( \ | |
2875 | defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ | |
2876 | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ | |
2877 | ) | |
2878 | # define XXH_VECTOR XXH_NEON | |
2879 | # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ | |
2880 | || (defined(__s390x__) && defined(__VEC__)) \ | |
2881 | && defined(__GNUC__) /* TODO: IBM XL */ | |
2882 | # define XXH_VECTOR XXH_VSX | |
2883 | # else | |
2884 | # define XXH_VECTOR XXH_SCALAR | |
2885 | # endif | |
2886 | #endif | |
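/*
 * Illustrative note (not part of the original header): since XXH_VECTOR can
 * be defined on the command line, the detection above can be overridden,
 * e.g. to benchmark the scalar path on an AVX2-capable machine:
 *
 *     cc -O3 -mavx2 -DXXH_VECTOR=XXH_SCALAR -c xxhash.c
 *
 * Any value from XXH_VECTOR_TYPE works, provided the target actually
 * supports the selected instruction set.
 */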
2887 | ||
2888 | /* | |
2889 | * Controls the alignment of the accumulator, | |
2890 | * for compatibility with aligned vector loads, which are usually faster. | |
2891 | */ | |
2892 | #ifndef XXH_ACC_ALIGN | |
2893 | # if defined(XXH_X86DISPATCH) | |
2894 | # define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ | |
2895 | # elif XXH_VECTOR == XXH_SCALAR /* scalar */ | |
2896 | # define XXH_ACC_ALIGN 8 | |
2897 | # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ | |
2898 | # define XXH_ACC_ALIGN 16 | |
2899 | # elif XXH_VECTOR == XXH_AVX2 /* avx2 */ | |
2900 | # define XXH_ACC_ALIGN 32 | |
2901 | # elif XXH_VECTOR == XXH_NEON /* neon */ | |
2902 | # define XXH_ACC_ALIGN 16 | |
2903 | # elif XXH_VECTOR == XXH_VSX /* vsx */ | |
2904 | # define XXH_ACC_ALIGN 16 | |
2905 | # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ | |
2906 | # define XXH_ACC_ALIGN 64 | |
2907 | # endif | |
2908 | #endif | |
2909 | ||
2910 | #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ | |
2911 | || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 | |
2912 | # define XXH_SEC_ALIGN XXH_ACC_ALIGN | |
2913 | #else | |
2914 | # define XXH_SEC_ALIGN 8 | |
2915 | #endif | |
2916 | ||
2917 | /* | |
2918 | * UGLY HACK: | |
2919 | * GCC usually generates the best code with -O3 for xxHash. | |
2920 | * | |
2921 | * However, when targeting AVX2, it is overzealous in its unrolling, resulting | |
2922 | * in code roughly 3/4 the speed of Clang. | |
2923 | * | |
2924 | * There are other issues, such as GCC splitting _mm256_loadu_si256 into | |
2925 | * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which | |
2926 | * only applies to Sandy and Ivy Bridge... which don't even support AVX2. | |
2927 | * | |
2928 | * That is why when compiling the AVX2 version, it is recommended to use either | |
2929 | * -O2 -mavx2 -march=haswell | |
2930 | * or | |
2931 | * -O2 -mavx2 -mno-avx256-split-unaligned-load | |
2932 | * for decent performance, or to use Clang instead. | |
2933 | * | |
2934 | * Fortunately, we can control the first one with a pragma that forces GCC into | |
2935 | * -O2, but the other one we can't control without "failed to inline always | |
2936 | * inline function due to target mismatch" warnings. | |
2937 | */ | |
2938 | #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ | |
2939 | && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ | |
2940 | && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ | |
2941 | # pragma GCC push_options | |
2942 | # pragma GCC optimize("-O2") | |
2943 | #endif | |
2944 | ||
2945 | ||
2946 | #if XXH_VECTOR == XXH_NEON | |
2947 | /* | |
2948 | * NEON's setup for vmlal_u32 is a little more complicated than it is on | |
2949 | * SSE2, AVX2, and VSX. | |
2950 | * | |
2951 | * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. | |
2952 | * | |
2953 | * To do the same operation, the 128-bit 'Q' register needs to be split into | |
2954 | * two 64-bit 'D' registers, performing this operation: | |
2955 | * | |
2956 | * [ a | b ] | |
2957 | * | '---------. .--------' | | |
2958 | * | x | | |
2959 | * | .---------' '--------. | | |
2960 | * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] | |
2961 | * | |
2962 | * Due to significant changes in aarch64, the fastest method for aarch64 is | |
2963 | * completely different from the fastest method for ARMv7-A. | |
2964 | * | |
2965 | * ARMv7-A treats D registers as unions overlaying Q registers, so modifying | |
2966 | * D11 will modify the high half of Q5. This is similar to how modifying AH | |
2967 | * will only affect bits 8-15 of AX on x86. | |
2968 | * | |
2969 | * VZIP takes two registers, and puts even lanes in one register and odd lanes | |
2970 | * in the other. | |
2971 | * | |
2972 | * On ARMv7-A, this strangely modifies both parameters in place instead of | |
2973 | * taking the usual 3-operand form. | |
2974 | * | |
2975 | * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the | |
2976 | * lower and upper halves of the Q register to end up with the high and low | |
2977 | * halves where we want - all in one instruction. | |
2978 | * | |
2979 | * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } | |
2980 | * | |
2981 | * Unfortunately, we need inline assembly for this: instructions that modify | |
2982 | * two registers at once cannot be expressed in GCC or Clang's IR, so the | |
2983 | * compilers have to create a copy. | |
2984 | * | |
2985 | * aarch64 requires a different approach. | |
2986 | * | |
2987 | * In order to make it easier to write a decent compiler for aarch64, many | |
2988 | * quirks were removed, such as conditional execution. | |
2989 | * | |
2990 | * NEON was also affected by this. | |
2991 | * | |
2992 | * aarch64 cannot access the high bits of a Q-form register, and writes to a | |
2993 | * D-form register zero the high bits, similar to how writes to W-form scalar | |
2994 | * registers (or DWORD registers on x86_64) work. | |
2995 | * | |
2996 | * The formerly free vget_high intrinsics now require a vext (with a few | |
2997 | * exceptions). | |
2998 | * | |
2999 | * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent | |
3000 | * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one | |
3001 | * operand. | |
3002 | * | |
3003 | * The equivalent of the VZIP.32 on the lower and upper halves would be this | |
3004 | * mess: | |
3005 | * | |
3006 | * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } | |
3007 | * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } | |
3008 | * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } | |
3009 | * | |
3010 | * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): | |
3011 | * | |
3012 | * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); | |
3013 | * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); | |
3014 | * | |
3015 | * This is available on ARMv7-A, but is less efficient than a single VZIP.32. | |
3016 | */ | |
3017 | ||
3018 | /*! | |
3019 | * Function-like macro: | |
3020 | * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) | |
3021 | * { | |
3022 | * outLo = (uint32x2_t)(in & 0xFFFFFFFF); | |
3023 | * outHi = (uint32x2_t)(in >> 32); | |
3024 | * in = UNDEFINED; | |
3025 | * } | |
3026 | */ | |
3027 | # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ | |
3028 | && defined(__GNUC__) \ | |
3029 | && !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) | |
3030 | # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ | |
3031 | do { \ | |
3032 | /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ | |
3033 | /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ | |
3034 | /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ | |
3035 | __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ | |
3036 | (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ | |
3037 | (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ | |
3038 | } while (0) | |
3039 | # else | |
3040 | # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ | |
3041 | do { \ | |
3042 | (outLo) = vmovn_u64 (in); \ | |
3043 | (outHi) = vshrn_n_u64 ((in), 32); \ | |
3044 | } while (0) | |
3045 | # endif | |
3046 | #endif /* XXH_VECTOR == XXH_NEON */ | |
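/*
 * Illustrative sketch (not part of the original header): what a call to
 * XXH_SPLIT_IN_PLACE() looks like. Only meaningful when compiling for NEON;
 * the variable names are arbitrary.
 */
#if 0 /* example only */
{
    uint64x2_t in = vdupq_n_u64(0x0123456789ABCDEFULL);
    uint32x2_t lo, hi;
    XXH_SPLIT_IN_PLACE(in, lo, hi);  /* lo = low 32 bits per lane, hi = high 32 bits */
    /* 'in' must be treated as clobbered from here on */
}
#endif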
3047 | ||
3048 | /* | |
3049 | * VSX and Z Vector helpers. | |
3050 | * | |
3051 | * This is very messy, and any pull requests to clean this up are welcome. | |
3052 | * | |
3053 | * There are a lot of problems with supporting VSX and s390x, due to | |
3054 | * inconsistent intrinsics, spotty coverage, and multiple endiannesses. | |
3055 | */ | |
3056 | #if XXH_VECTOR == XXH_VSX | |
3057 | # if defined(__s390x__) | |
3058 | # include <s390intrin.h> | |
3059 | # else | |
3060 | /* gcc's altivec.h can unconditionally #define the bool, vector, and pixel | |
3061 | * keywords, which breaks programs that already use these identifiers | |
3062 | * for other purposes. | |
3063 | * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined. | |
3064 | * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler, | |
3065 | * but it seems that, in some cases, it isn't. | |
3066 | * Force the build macro to be defined, so that keywords are not altered. | |
3067 | */ | |
3068 | # if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) | |
3069 | # define __APPLE_ALTIVEC__ | |
3070 | # endif | |
3071 | # include <altivec.h> | |
3072 | # endif | |
3073 | ||
3074 | typedef __vector unsigned long long xxh_u64x2; | |
3075 | typedef __vector unsigned char xxh_u8x16; | |
3076 | typedef __vector unsigned xxh_u32x4; | |
3077 | ||
3078 | # ifndef XXH_VSX_BE | |
3079 | # if defined(__BIG_ENDIAN__) \ | |
3080 | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |
3081 | # define XXH_VSX_BE 1 | |
3082 | # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ | |
3083 | # warning "-maltivec=be is not recommended. Please use native endianness." | |
3084 | # define XXH_VSX_BE 1 | |
3085 | # else | |
3086 | # define XXH_VSX_BE 0 | |
3087 | # endif | |
3088 | # endif /* !defined(XXH_VSX_BE) */ | |
3089 | ||
3090 | # if XXH_VSX_BE | |
3091 | # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) | |
3092 | # define XXH_vec_revb vec_revb | |
3093 | # else | |
3094 | /*! | |
3095 | * A polyfill for POWER9's vec_revb(). | |
3096 | */ | |
3097 | XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) | |
3098 | { | |
3099 | xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, | |
3100 | 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; | |
3101 | return vec_perm(val, val, vByteSwap); | |
3102 | } | |
3103 | # endif | |
3104 | # endif /* XXH_VSX_BE */ | |
3105 | ||
3106 | /*! | |
3107 | * Performs an unaligned vector load and byte swaps it on big endian. | |
3108 | */ | |
3109 | XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) | |
3110 | { | |
3111 | xxh_u64x2 ret; | |
3112 | XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); | |
3113 | # if XXH_VSX_BE | |
3114 | ret = XXH_vec_revb(ret); | |
3115 | # endif | |
3116 | return ret; | |
3117 | } | |
3118 | ||
3119 | /* | |
3120 | * vec_mulo and vec_mule are very problematic intrinsics on PowerPC | |
3121 | * | |
3122 | * These intrinsics weren't added until GCC 8, despite existing for a while, | |
3123 | * and they are endian dependent. Also, their meanings swap depending on the version. | |
3124 | */ | |
3125 | # if defined(__s390x__) | |
3126 | /* s390x is always big endian, no issue on this platform */ | |
3127 | # define XXH_vec_mulo vec_mulo | |
3128 | # define XXH_vec_mule vec_mule | |
3129 | # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) | |
3130 | /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ | |
3131 | # define XXH_vec_mulo __builtin_altivec_vmulouw | |
3132 | # define XXH_vec_mule __builtin_altivec_vmuleuw | |
3133 | # else | |
3134 | /* gcc needs inline assembly */ | |
3135 | /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ | |
3136 | XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) | |
3137 | { | |
3138 | xxh_u64x2 result; | |
3139 | __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); | |
3140 | return result; | |
3141 | } | |
3142 | XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) | |
3143 | { | |
3144 | xxh_u64x2 result; | |
3145 | __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); | |
3146 | return result; | |
3147 | } | |
3148 | # endif /* XXH_vec_mulo, XXH_vec_mule */ | |
3149 | #endif /* XXH_VECTOR == XXH_VSX */ | |
3150 | ||
3151 | ||
3152 | /* prefetch | |
3153 | * can be disabled by defining the XXH_NO_PREFETCH build macro */ | |
3154 | #if defined(XXH_NO_PREFETCH) | |
3155 | # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ | |
3156 | #else | |
3157 | # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ | |
3158 | # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ | |
3159 | # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) | |
3160 | # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) | |
3161 | # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) | |
3162 | # else | |
3163 | # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ | |
3164 | # endif | |
3165 | #endif /* XXH_NO_PREFETCH */ | |
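/*
 * Illustrative note (not part of the original header): disabling the hint
 * from the build line, for targets where prefetching hurts more than it
 * helps:
 *
 *     cc -O3 -DXXH_NO_PREFETCH -c xxhash.c
 */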
3166 | ||
3167 | ||
3168 | /* ========================================== | |
3169 | * XXH3 default settings | |
3170 | * ========================================== */ | |
3171 | ||
3172 | #define XXH_SECRET_DEFAULT_SIZE 192 /* must be >= XXH3_SECRET_SIZE_MIN */ | |
3173 | ||
3174 | #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) | |
3175 | # error "default keyset is not large enough" | |
3176 | #endif | |
3177 | ||
3178 | /*! Pseudorandom secret taken directly from FARSH. */ | |
3179 | XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { | |
3180 | 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, | |
3181 | 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, | |
3182 | 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, | |
3183 | 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, | |
3184 | 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, | |
3185 | 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, | |
3186 | 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, | |
3187 | 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, | |
3188 | 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, | |
3189 | 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, | |
3190 | 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, | |
3191 | 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, | |
3192 | }; | |
3193 | ||
3194 | ||
3195 | #ifdef XXH_OLD_NAMES | |
3196 | # define kSecret XXH3_kSecret | |
3197 | #endif | |
3198 | ||
3199 | #ifdef XXH_DOXYGEN | |
3200 | /*! | |
3201 | * @brief Calculates a 32-bit to 64-bit long multiply. | |
3202 | * | |
3203 | * Implemented as a macro. | |
3204 | * | |
3205 | * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't | |
3206 | * need to (but it shouldn't need to anyway, as it is about 7 instructions to do | |
3207 | * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we | |
3208 | * use that instead of the normal method. | |
3209 | * | |
3210 | * If you are compiling for platforms like Thumb-1 and don't have a better option, | |
3211 | * you may also want to write your own long multiply routine here. | |
3212 | * | |
3213 | * @param x, y Numbers to be multiplied | |
3214 | * @return 64-bit product of the low 32 bits of @p x and @p y. | |
3215 | */ | |
3216 | XXH_FORCE_INLINE xxh_u64 | |
3217 | XXH_mult32to64(xxh_u64 x, xxh_u64 y) | |
3218 | { | |
3219 | return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); | |
3220 | } | |
3221 | #elif defined(_MSC_VER) && defined(_M_IX86) | |
3222 | # include <intrin.h> | |
3223 | # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) | |
3224 | #else | |
3225 | /* | |
3226 | * Downcast + upcast is usually better than masking on older compilers like | |
3227 | * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. | |
3228 | * | |
3229 | * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands | |
3230 | * and perform a full 64x64 multiply -- entirely redundant on 32-bit. | |
3231 | */ | |
3232 | # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) | |
3233 | #endif | |
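/*
 * Illustrative sketch (not part of the original header): XXH_mult32to64()
 * only reads the low 32 bits of each operand, so high bits of wider inputs
 * are discarded by design.
 */
#if 0 /* example only */
{
    xxh_u64 const a = 0xFFFFFFFF00000002ULL;  /* upper 32 bits are ignored */
    xxh_u64 const b = 0x0000000000000003ULL;
    XXH_ASSERT(XXH_mult32to64(a, b) == 6);    /* 2 * 3, not a 64x64 product */
}
#endif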
3234 | ||
3235 | /*! | |
3236 | * @brief Calculates a 64->128-bit long multiply. | |
3237 | * | |
3238 | * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar | |
3239 | * version. | |
3240 | * | |
3241 | * @param lhs , rhs The 64-bit integers to be multiplied | |
3242 | * @return The 128-bit result represented in an @ref XXH128_hash_t. | |
3243 | */ | |
3244 | static XXH128_hash_t | |
3245 | XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) | |
3246 | { | |
3247 | /* | |
3248 | * GCC/Clang __uint128_t method. | |
3249 | * | |
3250 | * On most 64-bit targets, GCC and Clang define a __uint128_t type. | |
3251 | * This is usually the best way as it usually uses a native long 64-bit | |
3252 | * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. | |
3253 | * | |
3254 | * Usually. | |
3255 | * | |
3256 | * Even on 32-bit platforms, Clang (and Emscripten) define this type, | |
3257 | * despite not having the native arithmetic for it. This results in a slow | |
3258 | * compiler builtin call which calculates a full 128-bit multiply. | |
3259 | * In that case it is best to use the portable version. | |
3260 | * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 | |
3261 | */ | |
3262 | #if defined(__GNUC__) && !defined(__wasm__) \ | |
3263 | && defined(__SIZEOF_INT128__) \ | |
3264 | || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |
3265 | ||
3266 | __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; | |
3267 | XXH128_hash_t r128; | |
3268 | r128.low64 = (xxh_u64)(product); | |
3269 | r128.high64 = (xxh_u64)(product >> 64); | |
3270 | return r128; | |
3271 | ||
3272 | /* | |
3273 | * MSVC for x64's _umul128 method. | |
3274 | * | |
3275 | * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); | |
3276 | * | |
3277 | * This compiles to a single-operand MUL on x64. | |
3278 | */ | |
3279 | #elif defined(_M_X64) || defined(_M_IA64) | |
3280 | ||
3281 | #ifndef _MSC_VER | |
3282 | # pragma intrinsic(_umul128) | |
3283 | #endif | |
3284 | xxh_u64 product_high; | |
3285 | xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); | |
3286 | XXH128_hash_t r128; | |
3287 | r128.low64 = product_low; | |
3288 | r128.high64 = product_high; | |
3289 | return r128; | |
3290 | ||
3291 | /* | |
3292 | * MSVC for ARM64's __umulh method. | |
3293 | * | |
3294 | * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. | |
3295 | */ | |
3296 | #elif defined(_M_ARM64) | |
3297 | ||
3298 | #ifndef _MSC_VER | |
3299 | # pragma intrinsic(__umulh) | |
3300 | #endif | |
3301 | XXH128_hash_t r128; | |
3302 | r128.low64 = lhs * rhs; | |
3303 | r128.high64 = __umulh(lhs, rhs); | |
3304 | return r128; | |
3305 | ||
3306 | #else | |
3307 | /* | |
3308 | * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. | |
3309 | * | |
3310 | * This is a fast and simple grade school multiply, which is shown below | |
3311 | * with base 10 arithmetic instead of base 0x100000000. | |
3312 | * | |
3313 | * 9 3 // D2 lhs = 93 | |
3314 | * x 7 5 // D2 rhs = 75 | |
3315 | * ---------- | |
3316 | * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 | |
3317 | * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 | |
3318 | * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 | |
3319 | * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 | |
3320 | * --------- | |
3321 | * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 | |
3322 | * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 | |
3323 | * --------- | |
3324 | * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 | |
3325 | * | |
3326 | * The reasons for adding the products like this are: | |
3327 | * 1. It avoids manual carry tracking. Just like how | |
3328 | * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. | |
3329 | * This avoids a lot of complexity. | |
3330 | * | |
3331 | * 2. It hints for, and on Clang, compiles to, the powerful UMAAL | |
3332 | * instruction available in ARM's Digital Signal Processing extension | |
3333 | * in 32-bit ARMv6 and later, which is shown below: | |
3334 | * | |
3335 | * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) | |
3336 | * { | |
3337 | * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; | |
3338 | * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); | |
3339 | * *RdHi = (xxh_u32)(product >> 32); | |
3340 | * } | |
3341 | * | |
3342 | * This instruction was designed for efficient long multiplication, and | |
3343 | * allows this to be calculated in only 4 instructions at speeds | |
3344 | * comparable to some 64-bit ALUs. | |
3345 | * | |
3346 | * 3. It isn't terrible on other platforms. Usually this will be a couple | |
3347 | * of 32-bit ADD/ADCs. | |
3348 | */ | |
3349 | ||
3350 | /* First calculate all of the cross products. */ | |
3351 | xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); | |
3352 | xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); | |
3353 | xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); | |
3354 | xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); | |
3355 | ||
3356 | /* Now add the products together. These will never overflow. */ | |
3357 | xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; | |
3358 | xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; | |
3359 | xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); | |
3360 | ||
3361 | XXH128_hash_t r128; | |
3362 | r128.low64 = lower; | |
3363 | r128.high64 = upper; | |
3364 | return r128; | |
3365 | #endif | |
3366 | } | |
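/*
 * Illustrative sketch (not part of the original header): checking
 * XXH_mult64to128() against the base-10 example in the comment above.
 * 93 * 75 = 6975 fits entirely in the low 64 bits, so the high half is 0.
 */
#if 0 /* example only */
{
    XXH128_hash_t const r = XXH_mult64to128(93, 75);
    XXH_ASSERT(r.low64 == 6975 && r.high64 == 0);
}
#endif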
3367 | ||
3368 | /*! | |
3369 | * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. | |
3370 | * | |
3371 | * The reason for the separate function is to prevent passing too many structs | |
3372 | * around by value. This will hopefully inline the multiply, but we don't force it. | |
3373 | * | |
3374 | * @param lhs , rhs The 64-bit integers to multiply | |
3375 | * @return The low 64 bits of the product XOR'd by the high 64 bits. | |
3376 | * @see XXH_mult64to128() | |
3377 | */ | |
3378 | static xxh_u64 | |
3379 | XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) | |
3380 | { | |
3381 | XXH128_hash_t product = XXH_mult64to128(lhs, rhs); | |
3382 | return product.low64 ^ product.high64; | |
3383 | } | |
3384 | ||
3385 | /*! Seems to produce slightly better code on GCC for some reason. */ | |
3386 | XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) | |
3387 | { | |
3388 | XXH_ASSERT(0 <= shift && shift < 64); | |
3389 | return v64 ^ (v64 >> shift); | |
3390 | } | |
3391 | ||
3392 | /* | |
3393 | * This is a fast avalanche stage, | |
3394 | * suitable when input bits are already partially mixed | |
3395 | */ | |
3396 | static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) | |
3397 | { | |
3398 | h64 = XXH_xorshift64(h64, 37); | |
3399 | h64 *= 0x165667919E3779F9ULL; | |
3400 | h64 = XXH_xorshift64(h64, 32); | |
3401 | return h64; | |
3402 | } | |
3403 | ||
3404 | /* | |
3405 | * This is a stronger avalanche, | |
3406 | * inspired by Pelle Evensen's rrmxmx | |
3407 | * preferable when input has not been previously mixed | |
3408 | */ | |
3409 | static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) | |
3410 | { | |
3411 | /* this mix is inspired by Pelle Evensen's rrmxmx */ | |
3412 | h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); | |
3413 | h64 *= 0x9FB21C651E98DF25ULL; | |
3414 | h64 ^= (h64 >> 35) + len ; | |
3415 | h64 *= 0x9FB21C651E98DF25ULL; | |
3416 | return XXH_xorshift64(h64, 28); | |
3417 | } | |
3418 | ||
3419 | ||
3420 | /* ========================================== | |
3421 | * Short keys | |
3422 | * ========================================== | |
3423 | * One of the shortcomings of XXH32 and XXH64 was that their performance was | |
3424 | * sub-optimal on short lengths. They used an iterative algorithm which strongly | |
3425 | * favored lengths that were a multiple of 4 or 8. | |
3426 | * | |
3427 | * Instead of iterating over individual inputs, we use a set of single shot | |
3428 | * functions which piece together a range of lengths and operate in constant time. | |
3429 | * | |
3430 | * Additionally, the number of multiplies has been significantly reduced. This | |
3431 | * reduces latency, especially when emulating 64-bit multiplies on 32-bit. | |
3432 | * | |
3433 | * Depending on the platform, this may or may not be faster than XXH32, but it | |
3434 | * is almost guaranteed to be faster than XXH64. | |
3435 | */ | |
3436 | ||
3437 | /* | |
3438 | * At very short lengths, there isn't enough input to fully hide secrets, or use | |
3439 | * the entire secret. | |
3440 | * | |
3441 | * There is also only a limited amount of mixing we can do before significantly | |
3442 | * impacting performance. | |
3443 | * | |
3444 | * Therefore, we use different sections of the secret and always mix two secret | |
3445 | * samples with an XOR. This should have no effect on performance on the | |
3446 | * seedless or withSeed variants because everything _should_ be constant folded | |
3447 | * by modern compilers. | |
3448 | * | |
3449 | * The XOR mixing hides individual parts of the secret and increases entropy. | |
3450 | * | |
3451 | * This adds an extra layer of strength for custom secrets. | |
3452 | */ | |
3453 | XXH_FORCE_INLINE XXH64_hash_t | |
3454 | XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
3455 | { | |
3456 | XXH_ASSERT(input != NULL); | |
3457 | XXH_ASSERT(1 <= len && len <= 3); | |
3458 | XXH_ASSERT(secret != NULL); | |
3459 | /* | |
3460 | * len = 1: combined = { input[0], 0x01, input[0], input[0] } | |
3461 | * len = 2: combined = { input[1], 0x02, input[0], input[1] } | |
3462 | * len = 3: combined = { input[2], 0x03, input[0], input[1] } | |
3463 | */ | |
3464 | { xxh_u8 const c1 = input[0]; | |
3465 | xxh_u8 const c2 = input[len >> 1]; | |
3466 | xxh_u8 const c3 = input[len - 1]; | |
3467 | xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | |
3468 | | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); | |
3469 | xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; | |
3470 | xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; | |
3471 | return XXH64_avalanche(keyed); | |
3472 | } | |
3473 | } | |
3474 | ||
3475 | XXH_FORCE_INLINE XXH64_hash_t | |
3476 | XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
3477 | { | |
3478 | XXH_ASSERT(input != NULL); | |
3479 | XXH_ASSERT(secret != NULL); | |
3480 | XXH_ASSERT(4 <= len && len <= 8); | |
3481 | seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; | |
3482 | { xxh_u32 const input1 = XXH_readLE32(input); | |
3483 | xxh_u32 const input2 = XXH_readLE32(input + len - 4); | |
3484 | xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; | |
3485 | xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); | |
3486 | xxh_u64 const keyed = input64 ^ bitflip; | |
3487 | return XXH3_rrmxmx(keyed, len); | |
3488 | } | |
3489 | } | |
3490 | ||
3491 | XXH_FORCE_INLINE XXH64_hash_t | |
3492 | XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
3493 | { | |
3494 | XXH_ASSERT(input != NULL); | |
3495 | XXH_ASSERT(secret != NULL); | |
3496 | XXH_ASSERT(9 <= len && len <= 16); | |
3497 | { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; | |
3498 | xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; | |
3499 | xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; | |
3500 | xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; | |
3501 | xxh_u64 const acc = len | |
3502 | + XXH_swap64(input_lo) + input_hi | |
3503 | + XXH3_mul128_fold64(input_lo, input_hi); | |
3504 | return XXH3_avalanche(acc); | |
3505 | } | |
3506 | } | |
3507 | ||
3508 | XXH_FORCE_INLINE XXH64_hash_t | |
3509 | XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
3510 | { | |
3511 | XXH_ASSERT(len <= 16); | |
3512 | { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); | |
3513 | if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); | |
3514 | if (len) return XXH3_len_1to3_64b(input, len, secret, seed); | |
3515 | return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); | |
3516 | } | |
3517 | } | |
3518 | ||
3519 | /* | |
3520 | * DISCLAIMER: There are known *seed-dependent* multicollisions here due to | |
3521 | * multiplication by zero, affecting hashes of lengths 17 to 240. | |
3522 | * | |
3523 | * However, they are very unlikely. | |
3524 | * | |
3525 | * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all | |
3526 | * unseeded non-cryptographic hashes, it does not attempt to defend itself | |
3527 | * against specially crafted inputs, only random inputs. | |
3528 | * | |
3529 | * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes | |
3530 | * cancelling out the secret is risked an arbitrary number of times (addressed | |
3531 | * in XXH3_accumulate_512), this collision is very unlikely with random inputs | |
3532 | * and/or proper seeding: | |
3533 | * | |
3534 | * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a | |
3535 | * function that is only called up to 16 times per hash with up to 240 bytes of | |
3536 | * input. | |
3537 | * | |
3538 | * This is not too bad for a non-cryptographic hash function, especially with | |
3539 | * only 64 bit outputs. | |
3540 | * | |
3541 | * The 128-bit variant (which trades some speed for strength) is NOT affected | |
3542 | * by this, although it is always a good idea to use a proper seed if you care | |
3543 | * about strength. | |
3544 | */ | |
3545 | XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, | |
3546 | const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) | |
3547 | { | |
3548 | #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ | |
3549 | && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ | |
3550 | && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ | |
3551 | /* | |
3552 | * UGLY HACK: | |
3553 | * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in | |
3554 | * slower code. | |
3555 | * | |
3556 | * By forcing seed64 into a register, we disrupt the cost model and | |
3557 | * cause it to scalarize. See `XXH32_round()` | |
3558 | * | |
3559 | * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, | |
3560 | * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on | |
3561 | * GCC 9.2, despite both emitting scalar code. | |
3562 | * | |
3563 | * GCC generates much better scalar code than Clang for the rest of XXH3, | |
3564 | * which is why finding a more optimal codepath is of interest. | |
3565 | */ | |
3566 | XXH_COMPILER_GUARD(seed64); | |
3567 | #endif | |
3568 | { xxh_u64 const input_lo = XXH_readLE64(input); | |
3569 | xxh_u64 const input_hi = XXH_readLE64(input+8); | |
3570 | return XXH3_mul128_fold64( | |
3571 | input_lo ^ (XXH_readLE64(secret) + seed64), | |
3572 | input_hi ^ (XXH_readLE64(secret+8) - seed64) | |
3573 | ); | |
3574 | } | |
3575 | } | |
3576 | ||
3577 | /* For mid range keys, XXH3 uses a Mum-hash variant. */ | |
3578 | XXH_FORCE_INLINE XXH64_hash_t | |
3579 | XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, | |
3580 | const xxh_u8* XXH_RESTRICT secret, size_t secretSize, | |
3581 | XXH64_hash_t seed) | |
3582 | { | |
3583 | XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; | |
3584 | XXH_ASSERT(16 < len && len <= 128); | |
3585 | ||
3586 | { xxh_u64 acc = len * XXH_PRIME64_1; | |
3587 | if (len > 32) { | |
3588 | if (len > 64) { | |
3589 | if (len > 96) { | |
3590 | acc += XXH3_mix16B(input+48, secret+96, seed); | |
3591 | acc += XXH3_mix16B(input+len-64, secret+112, seed); | |
3592 | } | |
3593 | acc += XXH3_mix16B(input+32, secret+64, seed); | |
3594 | acc += XXH3_mix16B(input+len-48, secret+80, seed); | |
3595 | } | |
3596 | acc += XXH3_mix16B(input+16, secret+32, seed); | |
3597 | acc += XXH3_mix16B(input+len-32, secret+48, seed); | |
3598 | } | |
3599 | acc += XXH3_mix16B(input+0, secret+0, seed); | |
3600 | acc += XXH3_mix16B(input+len-16, secret+16, seed); | |
3601 | ||
3602 | return XXH3_avalanche(acc); | |
3603 | } | |
3604 | } | |
3605 | ||
3606 | #define XXH3_MIDSIZE_MAX 240 | |
3607 | ||
3608 | XXH_NO_INLINE XXH64_hash_t | |
3609 | XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, | |
3610 | const xxh_u8* XXH_RESTRICT secret, size_t secretSize, | |
3611 | XXH64_hash_t seed) | |
3612 | { | |
3613 | XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; | |
3614 | XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); | |
3615 | ||
3616 | #define XXH3_MIDSIZE_STARTOFFSET 3 | |
3617 | #define XXH3_MIDSIZE_LASTOFFSET 17 | |
3618 | ||
3619 | { xxh_u64 acc = len * XXH_PRIME64_1; | |
3620 | int const nbRounds = (int)len / 16; | |
3621 | int i; | |
3622 | for (i=0; i<8; i++) { | |
3623 | acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); | |
3624 | } | |
3625 | acc = XXH3_avalanche(acc); | |
3626 | XXH_ASSERT(nbRounds >= 8); | |
3627 | #if defined(__clang__) /* Clang */ \ | |
3628 | && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ | |
3629 | && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ | |
3630 | /* | |
3631 | * UGLY HACK: | |
3632 | * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. | |
3633 | * Everywhere else, it uses scalar code. | |
3634 | * | |
3635 | * For 64->128-bit multiplies, even if the NEON was 100% optimal, it | |
3636 | * would still be slower than UMAAL (see XXH_mult64to128). | |
3637 | * | |
3638 | * Unfortunately, Clang doesn't handle the long multiplies properly and | |
3639 | * converts them to the nonexistent "vmulq_u64" intrinsic, which is then | |
3640 | * scalarized into an ugly mess of VMOV.32 instructions. | |
3641 | * | |
3642 | * This mess is difficult to avoid without turning autovectorization | |
3643 | * off completely, but the issues are usually relatively minor and/or not | |
3644 | * worth fixing. | |
3645 | * | |
3646 | * This loop is the easiest to fix, as unlike XXH32, this pragma | |
3647 | * _actually works_ because it is a loop vectorization instead of an | |
3648 | * SLP vectorization. | |
3649 | */ | |
3650 | #pragma clang loop vectorize(disable) | |
3651 | #endif | |
3652 | for (i=8 ; i < nbRounds; i++) { | |
3653 | acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); | |
3654 | } | |
3655 | /* last bytes */ | |
3656 | acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); | |
3657 | return XXH3_avalanche(acc); | |
3658 | } | |
3659 | } | |
3660 | ||
3661 | ||
3662 | /* ======= Long Keys ======= */ | |
3663 | ||
3664 | #define XXH_STRIPE_LEN 64 | |
3665 | #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ | |
3666 | #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) | |
3667 | ||
3668 | #ifdef XXH_OLD_NAMES | |
3669 | # define STRIPE_LEN XXH_STRIPE_LEN | |
3670 | # define ACC_NB XXH_ACC_NB | |
3671 | #endif | |
3672 | ||
3673 | XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) | |
3674 | { | |
3675 | if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); | |
3676 | XXH_memcpy(dst, &v64, sizeof(v64)); | |
3677 | } | |
3678 | ||
3679 | /* Several intrinsic functions below are supposed to accept __int64 as an argument, | |
3680 | * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . | |
3681 | * However, several environments do not define __int64 type, | |
3682 | * requiring a workaround. | |
3683 | */ | |
3684 | #if !defined (__VMS) \ | |
3685 | && (defined (__cplusplus) \ | |
3686 | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) | |
3687 | typedef int64_t xxh_i64; | |
3688 | #else | |
3689 | /* the following type must have a width of 64-bit */ | |
3690 | typedef long long xxh_i64; | |
3691 | #endif | |
3692 | ||
3693 | /* | |
3694 | * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. | |
3695 | * | |
3696 | * It is a hardened version of UMAC, based on FARSH's implementation. | |
3697 | * | |
3698 | * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD | |
3699 | * implementations, and it is ridiculously fast. | |
3700 | * | |
3701 | * We harden it by mixing the original input into the accumulators as well as the product. | |
3702 | * | |
3703 | * This means that in the (relatively likely) case of a multiply by zero, the | |
3704 | * original input is preserved. | |
3705 | * | |
3706 | * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve | |
3707 | * cross-pollination, as otherwise the upper and lower halves would be | |
3708 | * essentially independent. | |
3709 | * | |
3710 | * This doesn't matter on 64-bit hashes since they all get merged together in | |
3711 | * the end, so we skip the extra step. | |
3712 | * | |
3713 | * Both XXH3_64bits and XXH3_128bits use this subroutine. | |
3714 | */ | |
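/*
 * Illustrative sketch (not part of the original header): one 64-bit lane of
 * the round described above, written in plain scalar C. 'acc', 'input' and
 * 'secret' are assumed to be in scope, as in the SIMD variants below.
 */
#if 0 /* example only */
{
    size_t const lane = 0;  /* 0 <= lane < XXH_ACC_NB */
    xxh_u64* const xacc = (xxh_u64*)acc;
    xxh_u64 const data_val = XXH_readLE64((const xxh_u8*)input  + lane * 8);
    xxh_u64 const data_key = data_val ^ XXH_readLE64((const xxh_u8*)secret + lane * 8);
    xacc[lane ^ 1] += data_val;                        /* swap adjacent lanes, add input */
    xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF,
                                 data_key >> 32);      /* 32x32->64 product */
}
#endif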
3715 | ||
3716 | #if (XXH_VECTOR == XXH_AVX512) \ | |
3717 | || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) | |
3718 | ||
3719 | #ifndef XXH_TARGET_AVX512 | |
3720 | # define XXH_TARGET_AVX512 /* disable attribute target */ | |
3721 | #endif | |
3722 | ||
3723 | XXH_FORCE_INLINE XXH_TARGET_AVX512 void | |
3724 | XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, | |
3725 | const void* XXH_RESTRICT input, | |
3726 | const void* XXH_RESTRICT secret) | |
3727 | { | |
3728 | __m512i* const xacc = (__m512i *) acc; | |
3729 | XXH_ASSERT((((size_t)acc) & 63) == 0); | |
3730 | XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); | |
3731 | ||
3732 | { | |
3733 | /* data_vec = input[0]; */ | |
3734 | __m512i const data_vec = _mm512_loadu_si512 (input); | |
3735 | /* key_vec = secret[0]; */ | |
3736 | __m512i const key_vec = _mm512_loadu_si512 (secret); | |
3737 | /* data_key = data_vec ^ key_vec; */ | |
3738 | __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); | |
3739 | /* data_key_lo = data_key >> 32; */ | |
3740 | __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); | |
3741 | /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ | |
3742 | __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); | |
3743 | /* xacc[0] += swap(data_vec); */ | |
3744 | __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); | |
3745 | __m512i const sum = _mm512_add_epi64(*xacc, data_swap); | |
3746 | /* xacc[0] += product; */ | |
3747 | *xacc = _mm512_add_epi64(product, sum); | |
3748 | } | |
3749 | } | |
3750 | ||
3751 | /* | |
3752 | * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. | |
3753 | * | |
3754 | * Multiplication isn't perfect, as explained by Google in HighwayHash: | |
3755 | * | |
3756 | * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to | |
3757 | * // varying degrees. In descending order of goodness, bytes | |
3758 | * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. | |
3759 | * // As expected, the upper and lower bytes are much worse. | |
3760 | * | |
3761 | * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 | |
3762 | * | |
3763 | * Since our algorithm uses a pseudorandom secret to add some variance into the | |
3764 | * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. | |
3765 | * | |
3766 | * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid | |
3767 | * extraction. | |
3768 | * | |
3769 | * Both XXH3_64bits and XXH3_128bits use this subroutine. | |
3770 | */ | |
3771 | ||
3772 | XXH_FORCE_INLINE XXH_TARGET_AVX512 void | |
3773 | XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
3774 | { | |
3775 | XXH_ASSERT((((size_t)acc) & 63) == 0); | |
3776 | XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); | |
3777 | { __m512i* const xacc = (__m512i*) acc; | |
3778 | const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); | |
3779 | ||
3780 | /* xacc[0] ^= (xacc[0] >> 47) */ | |
3781 | __m512i const acc_vec = *xacc; | |
3782 | __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); | |
3783 | __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); | |
3784 | /* xacc[0] ^= secret; */ | |
3785 | __m512i const key_vec = _mm512_loadu_si512 (secret); | |
3786 | __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); | |
3787 | ||
3788 | /* xacc[0] *= XXH_PRIME32_1; */ | |
3789 | __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); | |
3790 | __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); | |
3791 | __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); | |
3792 | *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); | |
3793 | } | |
3794 | } | |
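/*
 * Illustrative sketch (not part of the original header): the same scramble,
 * expressed for a single scalar 64-bit lane. 'acc' and 'secret' are assumed
 * to be in scope, as in the SIMD variants.
 */
#if 0 /* example only */
{
    size_t const lane = 0;  /* 0 <= lane < XXH_ACC_NB */
    xxh_u64* const xacc = (xxh_u64*)acc;
    xxh_u64 v = xacc[lane];
    v  = XXH_xorshift64(v, 47);                            /* v ^= v >> 47 */
    v ^= XXH_readLE64((const xxh_u8*)secret + lane * 8);   /* mix in secret */
    v *= XXH_PRIME32_1;                                    /* 64x32-bit multiply */
    xacc[lane] = v;
}
#endif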
3795 | ||
3796 | XXH_FORCE_INLINE XXH_TARGET_AVX512 void | |
3797 | XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) | |
3798 | { | |
3799 | XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); | |
3800 | XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); | |
3801 | XXH_ASSERT(((size_t)customSecret & 63) == 0); | |
3802 | (void)(&XXH_writeLE64); | |
3803 | { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); | |
3804 | __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); | |
3805 | ||
3806 | const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); | |
3807 | __m512i* const dest = ( __m512i*) customSecret; | |
3808 | int i; | |
3809 | XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ | |
3810 | XXH_ASSERT(((size_t)dest & 63) == 0); | |
3811 | for (i=0; i < nbRounds; ++i) { | |
3812 | /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not 'void const*', | |
3813 | * so this will warn "discards 'const' qualifier". */ | |
3814 | union { | |
3815 | const __m512i* cp; | |
3816 | void* p; | |
3817 | } remote_const_void; | |
3818 | remote_const_void.cp = src + i; | |
3819 | dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); | |
3820 | } } | |
3821 | } | |
3822 | ||
3823 | #endif | |
3824 | ||
3825 | #if (XXH_VECTOR == XXH_AVX2) \ | |
3826 | || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) | |
3827 | ||
3828 | #ifndef XXH_TARGET_AVX2 | |
3829 | # define XXH_TARGET_AVX2 /* disable attribute target */ | |
3830 | #endif | |
3831 | ||
3832 | XXH_FORCE_INLINE XXH_TARGET_AVX2 void | |
3833 | XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, | |
3834 | const void* XXH_RESTRICT input, | |
3835 | const void* XXH_RESTRICT secret) | |
3836 | { | |
3837 | XXH_ASSERT((((size_t)acc) & 31) == 0); | |
3838 | { __m256i* const xacc = (__m256i *) acc; | |
3839 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3840 | * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ | |
3841 | const __m256i* const xinput = (const __m256i *) input; | |
3842 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3843 | * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ | |
3844 | const __m256i* const xsecret = (const __m256i *) secret; | |
3845 | ||
3846 | size_t i; | |
3847 | for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { | |
3848 | /* data_vec = xinput[i]; */ | |
3849 | __m256i const data_vec = _mm256_loadu_si256 (xinput+i); | |
3850 | /* key_vec = xsecret[i]; */ | |
3851 | __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); | |
3852 | /* data_key = data_vec ^ key_vec; */ | |
3853 | __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); | |
3854 | /* data_key_lo = data_key >> 32; */ | |
3855 | __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); | |
3856 | /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ | |
3857 | __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); | |
3858 | /* xacc[i] += swap(data_vec); */ | |
3859 | __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); | |
3860 | __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); | |
3861 | /* xacc[i] += product; */ | |
3862 | xacc[i] = _mm256_add_epi64(product, sum); | |
3863 | } } | |
3864 | } | |
3865 | ||
3866 | XXH_FORCE_INLINE XXH_TARGET_AVX2 void | |
3867 | XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
3868 | { | |
3869 | XXH_ASSERT((((size_t)acc) & 31) == 0); | |
3870 | { __m256i* const xacc = (__m256i*) acc; | |
3871 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3872 | * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ | |
3873 | const __m256i* const xsecret = (const __m256i *) secret; | |
3874 | const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); | |
3875 | ||
3876 | size_t i; | |
3877 | for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { | |
3878 | /* xacc[i] ^= (xacc[i] >> 47) */ | |
3879 | __m256i const acc_vec = xacc[i]; | |
3880 | __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); | |
3881 | __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); | |
3882 | /* xacc[i] ^= xsecret; */ | |
3883 | __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); | |
3884 | __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); | |
3885 | ||
3886 | /* xacc[i] *= XXH_PRIME32_1; */ | |
3887 | __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); | |
3888 | __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); | |
3889 | __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); | |
3890 | xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); | |
3891 | } | |
3892 | } | |
3893 | } | |
3894 | ||
3895 | XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) | |
3896 | { | |
3897 | XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); | |
3898 | XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); | |
3899 | XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); | |
3900 | (void)(&XXH_writeLE64); | |
3901 | XXH_PREFETCH(customSecret); | |
3902 | { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); | |
3903 | ||
3904 | const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); | |
3905 | __m256i* dest = ( __m256i*) customSecret; | |
3906 | ||
3907 | # if defined(__GNUC__) || defined(__clang__) | |
3908 | /* | |
3909 | * On GCC & Clang, marking 'dest' as modified causes the compiler to: | |
3910 | * - not extract the secret from sse registers in the internal loop | |
3911 | * - use fewer registers, and avoid pushing them onto the stack | |
3912 | */ | |
3913 | XXH_COMPILER_GUARD(dest); | |
3914 | # endif | |
3915 | XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ | |
3916 | XXH_ASSERT(((size_t)dest & 31) == 0); | |
3917 | ||
3918 | /* GCC -O2 needs the loop unrolled manually */ | |
3919 | dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); | |
3920 | dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); | |
3921 | dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); | |
3922 | dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); | |
3923 | dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); | |
3924 | dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); | |
3925 | } | |
3926 | } | |
3927 | ||
3928 | #endif | |
3929 | ||
3930 | /* x86dispatch always generates SSE2 */ | |
3931 | #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) | |
3932 | ||
3933 | #ifndef XXH_TARGET_SSE2 | |
3934 | # define XXH_TARGET_SSE2 /* disable attribute target */ | |
3935 | #endif | |
3936 | ||
3937 | XXH_FORCE_INLINE XXH_TARGET_SSE2 void | |
3938 | XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, | |
3939 | const void* XXH_RESTRICT input, | |
3940 | const void* XXH_RESTRICT secret) | |
3941 | { | |
3942 | /* SSE2 is just a half-scale version of the AVX2 version. */ | |
3943 | XXH_ASSERT((((size_t)acc) & 15) == 0); | |
3944 | { __m128i* const xacc = (__m128i *) acc; | |
3945 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3946 | * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ | |
3947 | const __m128i* const xinput = (const __m128i *) input; | |
3948 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3949 | * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ | |
3950 | const __m128i* const xsecret = (const __m128i *) secret; | |
3951 | ||
3952 | size_t i; | |
3953 | for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { | |
3954 | /* data_vec = xinput[i]; */ | |
3955 | __m128i const data_vec = _mm_loadu_si128 (xinput+i); | |
3956 | /* key_vec = xsecret[i]; */ | |
3957 | __m128i const key_vec = _mm_loadu_si128 (xsecret+i); | |
3958 | /* data_key = data_vec ^ key_vec; */ | |
3959 | __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); | |
3960 | /* data_key_lo = data_key >> 32; */ | |
3961 | __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); | |
3962 | /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ | |
3963 | __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); | |
3964 | /* xacc[i] += swap(data_vec); */ | |
3965 | __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); | |
3966 | __m128i const sum = _mm_add_epi64(xacc[i], data_swap); | |
3967 | /* xacc[i] += product; */ | |
3968 | xacc[i] = _mm_add_epi64(product, sum); | |
3969 | } } | |
3970 | } | |
3971 | ||
3972 | XXH_FORCE_INLINE XXH_TARGET_SSE2 void | |
3973 | XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
3974 | { | |
3975 | XXH_ASSERT((((size_t)acc) & 15) == 0); | |
3976 | { __m128i* const xacc = (__m128i*) acc; | |
3977 | /* Unaligned. This is mainly for pointer arithmetic, and because | |
3978 | * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ | |
3979 | const __m128i* const xsecret = (const __m128i *) secret; | |
3980 | const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); | |
3981 | ||
3982 | size_t i; | |
3983 | for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { | |
3984 | /* xacc[i] ^= (xacc[i] >> 47) */ | |
3985 | __m128i const acc_vec = xacc[i]; | |
3986 | __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); | |
3987 | __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); | |
3988 | /* xacc[i] ^= xsecret[i]; */ | |
3989 | __m128i const key_vec = _mm_loadu_si128 (xsecret+i); | |
3990 | __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); | |
3991 | ||
3992 | /* xacc[i] *= XXH_PRIME32_1; */ | |
3993 | __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); | |
3994 | __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); | |
3995 | __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); | |
3996 | xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); | |
3997 | } | |
3998 | } | |
3999 | } | |
4000 | ||
4001 | XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) | |
4002 | { | |
4003 | XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); | |
4004 | (void)(&XXH_writeLE64); | |
4005 | { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); | |
4006 | ||
4007 | # if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 | |
4008 | /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ | |
4009 | XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; | |
4010 | __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); | |
4011 | # else | |
4012 | __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); | |
4013 | # endif | |
4014 | int i; | |
4015 | ||
4016 | const void* const src16 = XXH3_kSecret; | |
4017 | __m128i* dst16 = (__m128i*) customSecret; | |
4018 | # if defined(__GNUC__) || defined(__clang__) | |
4019 | /* | |
4020 | * On GCC & Clang, marking 'dst16' as modified causes the compiler to: | |
4021 | * - not extract the secret from sse registers in the internal loop | |
4022 | * - use fewer registers, and avoid pushing them onto the stack | |
4023 | */ | |
4024 | XXH_COMPILER_GUARD(dst16); | |
4025 | # endif | |
4026 | XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ | |
4027 | XXH_ASSERT(((size_t)dst16 & 15) == 0); | |
4028 | ||
4029 | for (i=0; i < nbRounds; ++i) { | |
4030 | dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); | |
4031 | } } | |
4032 | } | |
4033 | ||
4034 | #endif | |
4035 | ||
4036 | #if (XXH_VECTOR == XXH_NEON) | |
4037 | ||
4038 | XXH_FORCE_INLINE void | |
4039 | XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, | |
4040 | const void* XXH_RESTRICT input, | |
4041 | const void* XXH_RESTRICT secret) | |
4042 | { | |
4043 | XXH_ASSERT((((size_t)acc) & 15) == 0); | |
4044 | { | |
4045 | uint64x2_t* const xacc = (uint64x2_t *) acc; | |
4046 | /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ | |
4047 | uint8_t const* const xinput = (const uint8_t *) input; | |
4048 | uint8_t const* const xsecret = (const uint8_t *) secret; | |
4049 | ||
4050 | size_t i; | |
4051 | for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { | |
4052 | /* data_vec = xinput[i]; */ | |
4053 | uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); | |
4054 | /* key_vec = xsecret[i]; */ | |
4055 | uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); | |
4056 | uint64x2_t data_key; | |
4057 | uint32x2_t data_key_lo, data_key_hi; | |
4058 | /* xacc[i] += swap(data_vec); */ | |
4059 | uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); | |
4060 | uint64x2_t const swapped = vextq_u64(data64, data64, 1); | |
4061 | xacc[i] = vaddq_u64 (xacc[i], swapped); | |
4062 | /* data_key = data_vec ^ key_vec; */ | |
4063 | data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); | |
4064 | /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); | |
4065 | * data_key_hi = (uint32x2_t) (data_key >> 32); | |
4066 | * data_key = UNDEFINED; */ | |
4067 | XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); | |
4068 | /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ | |
4069 | xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); | |
4070 | ||
4071 | } | |
4072 | } | |
4073 | } | |
4074 | ||
4075 | XXH_FORCE_INLINE void | |
4076 | XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
4077 | { | |
4078 | XXH_ASSERT((((size_t)acc) & 15) == 0); | |
4079 | ||
4080 | { uint64x2_t* xacc = (uint64x2_t*) acc; | |
4081 | uint8_t const* xsecret = (uint8_t const*) secret; | |
4082 | uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); | |
4083 | ||
4084 | size_t i; | |
4085 | for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { | |
4086 | /* xacc[i] ^= (xacc[i] >> 47); */ | |
4087 | uint64x2_t acc_vec = xacc[i]; | |
4088 | uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); | |
4089 | uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); | |
4090 | ||
4091 | /* xacc[i] ^= xsecret[i]; */ | |
4092 | uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16)); | |
4093 | uint64x2_t data_key = veorq_u64 (data_vec, vreinterpretq_u64_u8(key_vec)); | |
4094 | ||
4095 | /* xacc[i] *= XXH_PRIME32_1 */ | |
4096 | uint32x2_t data_key_lo, data_key_hi; | |
4097 | /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); | |
4098 | * data_key_hi = (uint32x2_t) (xacc[i] >> 32); | |
4099 | * xacc[i] = UNDEFINED; */ | |
4100 | XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); | |
4101 | { /* | |
4102 | * prod_hi = (data_key >> 32) * XXH_PRIME32_1; | |
4103 | * | |
4104 | * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will | |
4105 | * incorrectly "optimize" this: | |
4106 | * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); | |
4107 | * shifted = vshll_n_u32(tmp, 32); | |
4108 | * to this: | |
4109 | * tmp = "vmulq_u64"(a, b); // no such thing! | |
4110 | * shifted = vshlq_n_u64(tmp, 32); | |
4111 | * | |
4112 | * However, unlike SSE, Clang lacks a 64-bit multiply routine | |
4113 | * for NEON, and it scalarizes two 64-bit multiplies instead. | |
4114 | * | |
4115 | * vmull_u32 has the same timing as vmul_u32, and it avoids | |
4116 | * this bug completely. | |
4117 | * See https://bugs.llvm.org/show_bug.cgi?id=39967 | |
4118 | */ | |
4119 | uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); | |
4120 | /* xacc[i] = prod_hi << 32; */ | |
4121 | xacc[i] = vshlq_n_u64(prod_hi, 32); | |
4122 | /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ | |
4123 | xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); | |
4124 | } | |
4125 | } } | |
4126 | } | |
4127 | ||
4128 | #endif | |
4129 | ||
4130 | #if (XXH_VECTOR == XXH_VSX) | |
4131 | ||
4132 | XXH_FORCE_INLINE void | |
4133 | XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, | |
4134 | const void* XXH_RESTRICT input, | |
4135 | const void* XXH_RESTRICT secret) | |
4136 | { | |
4137 | /* presumed aligned */ | |
4138 | unsigned long long* const xacc = (unsigned long long*) acc; | |
4139 | xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ | |
4140 | xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ | |
4141 | xxh_u64x2 const v32 = { 32, 32 }; | |
4142 | size_t i; | |
4143 | for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { | |
4144 | /* data_vec = xinput[i]; */ | |
4145 | xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); | |
4146 | /* key_vec = xsecret[i]; */ | |
4147 | xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); | |
4148 | xxh_u64x2 const data_key = data_vec ^ key_vec; | |
4149 | /* shuffled = (data_key << 32) | (data_key >> 32); */ | |
4150 | xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); | |
4151 | /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ | |
4152 | xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); | |
4153 | /* acc_vec = xacc[i]; */ | |
4154 | xxh_u64x2 acc_vec = vec_xl(0, xacc + 2 * i); | |
4155 | acc_vec += product; | |
4156 | ||
4157 | /* swap high and low halves */ | |
4158 | #ifdef __s390x__ | |
4159 | acc_vec += vec_permi(data_vec, data_vec, 2); | |
4160 | #else | |
4161 | acc_vec += vec_xxpermdi(data_vec, data_vec, 2); | |
4162 | #endif | |
4163 | /* xacc[i] = acc_vec; */ | |
4164 | vec_xst(acc_vec, 0, xacc + 2 * i); | |
4165 | } | |
4166 | } | |
4167 | ||
4168 | XXH_FORCE_INLINE void | |
4169 | XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
4170 | { | |
4171 | XXH_ASSERT((((size_t)acc) & 15) == 0); | |
4172 | ||
4173 | { xxh_u64x2* const xacc = (xxh_u64x2*) acc; | |
4174 | const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; | |
4175 | /* constants */ | |
4176 | xxh_u64x2 const v32 = { 32, 32 }; | |
4177 | xxh_u64x2 const v47 = { 47, 47 }; | |
4178 | xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; | |
4179 | size_t i; | |
4180 | for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { | |
4181 | /* xacc[i] ^= (xacc[i] >> 47); */ | |
4182 | xxh_u64x2 const acc_vec = xacc[i]; | |
4183 | xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); | |
4184 | ||
4185 | /* xacc[i] ^= xsecret[i]; */ | |
4186 | xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); | |
4187 | xxh_u64x2 const data_key = data_vec ^ key_vec; | |
4188 | ||
4189 | /* xacc[i] *= XXH_PRIME32_1 */ | |
4190 | /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ | |
4191 | xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); | |
4192 | /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ | |
4193 | xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); | |
4194 | xacc[i] = prod_odd + (prod_even << v32); | |
4195 | } } | |
4196 | } | |
4197 | ||
4198 | #endif | |
4199 | ||
4200 | /* scalar variants - universal */ | |
4201 | ||
4202 | XXH_FORCE_INLINE void | |
4203 | XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, | |
4204 | const void* XXH_RESTRICT input, | |
4205 | const void* XXH_RESTRICT secret) | |
4206 | { | |
4207 | xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ | |
4208 | const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ | |
4209 | const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ | |
4210 | size_t i; | |
4211 | XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); | |
4212 | for (i=0; i < XXH_ACC_NB; i++) { | |
4213 | xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); | |
4214 | xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); | |
4215 | xacc[i ^ 1] += data_val; /* swap adjacent lanes */ | |
4216 | xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); | |
4217 | } | |
4218 | } | |
4219 | ||
4220 | XXH_FORCE_INLINE void | |
4221 | XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
4222 | { | |
4223 | xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ | |
4224 | const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ | |
4225 | size_t i; | |
4226 | XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); | |
4227 | for (i=0; i < XXH_ACC_NB; i++) { | |
4228 | xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); | |
4229 | xxh_u64 acc64 = xacc[i]; | |
4230 | acc64 = XXH_xorshift64(acc64, 47); | |
4231 | acc64 ^= key64; | |
4232 | acc64 *= XXH_PRIME32_1; | |
4233 | xacc[i] = acc64; | |
4234 | } | |
4235 | } | |
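/*
 * The scalar routines above define the reference semantics that every SIMD
 * path in this file must reproduce lane for lane. A minimal sanity sketch of
 * one scramble step (the lane and secret values are arbitrary assumptions):
 */
#if 0
static void XXH3_example_scrambleStep(void)
{
    xxh_u64 acc64 = 0x0123456789ABCDEFULL;       /* arbitrary accumulator lane */
    xxh_u64 const key64 = 0xFEDCBA9876543210ULL; /* arbitrary secret word */
    acc64 = XXH_xorshift64(acc64, 47);           /* acc ^= acc >> 47 */
    acc64 ^= key64;                              /* mix in the secret */
    acc64 *= XXH_PRIME32_1;                      /* multiply by a 32-bit prime */
    (void)acc64;  /* identical math to one lane of any SIMD variant above */
}
#endif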
4236 | ||
4237 | XXH_FORCE_INLINE void | |
4238 | XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) | |
4239 | { | |
4240 | /* | |
4241 | * We need a separate pointer for the hack below, | |
4242 | * which requires a non-const pointer. | |
4243 | * Any decent compiler will optimize this out otherwise. | |
4244 | */ | |
4245 | const xxh_u8* kSecretPtr = XXH3_kSecret; | |
4246 | XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); | |
4247 | ||
4248 | #if defined(__clang__) && defined(__aarch64__) | |
4249 | /* | |
4250 | * UGLY HACK: | |
4251 | * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are | |
4252 | * placed sequentially, in order, at the top of the unrolled loop. | |
4253 | * | |
4254 | * While MOVK is great for generating constants (2 cycles for a 64-bit | |
4255 | * constant compared to 4 cycles for LDR), long MOVK chains stall the | |
4256 | * integer pipelines: | |
4257 | * I L S | |
4258 | * MOVK | |
4259 | * MOVK | |
4260 | * MOVK | |
4261 | * MOVK | |
4262 | * ADD | |
4263 | * SUB STR | |
4264 | * STR | |
4265 | * By forcing loads from memory (as the asm line causes Clang to assume | |
4266 | * that kSecretPtr has been changed), the pipelines are used more | |
4267 | * efficiently: | |
4268 | * I L S | |
4269 | * LDR | |
4270 | * ADD LDR | |
4271 | * SUB STR | |
4272 | * STR | |
4273 | * XXH3_64bits_withSeed, len == 256, Snapdragon 835 | |
4274 | * without hack: 2654.4 MB/s | |
4275 | * with hack: 3202.9 MB/s | |
4276 | */ | |
4277 | XXH_COMPILER_GUARD(kSecretPtr); | |
4278 | #endif | |
4279 | /* | |
4280 | * Note: in debug mode, this overrides the asm optimization | |
4281 | * and Clang will emit MOVK chains again. | |
4282 | */ | |
4283 | XXH_ASSERT(kSecretPtr == XXH3_kSecret); | |
4284 | ||
4285 | { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; | |
4286 | int i; | |
4287 | for (i=0; i < nbRounds; i++) { | |
4288 | /* | |
4289 | * The asm hack causes Clang to assume that kSecretPtr aliases with | |
4290 | * customSecret, and on aarch64, this prevented LDP from merging two | |
4291 | * loads together for free. Putting the loads together before the stores | |
4292 | * properly generates LDP. | |
4293 | */ | |
4294 | xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; | |
4295 | xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; | |
4296 | XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); | |
4297 | XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); | |
4298 | } } | |
4299 | } | |
4300 | ||
4301 | ||
4302 | typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); | |
4303 | typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); | |
4304 | typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); | |
4305 | ||
4306 | ||
4307 | #if (XXH_VECTOR == XXH_AVX512) | |
4308 | ||
4309 | #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 | |
4310 | #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 | |
4311 | #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 | |
4312 | ||
4313 | #elif (XXH_VECTOR == XXH_AVX2) | |
4314 | ||
4315 | #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 | |
4316 | #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 | |
4317 | #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 | |
4318 | ||
4319 | #elif (XXH_VECTOR == XXH_SSE2) | |
4320 | ||
4321 | #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 | |
4322 | #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 | |
4323 | #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 | |
4324 | ||
4325 | #elif (XXH_VECTOR == XXH_NEON) | |
4326 | ||
4327 | #define XXH3_accumulate_512 XXH3_accumulate_512_neon | |
4328 | #define XXH3_scrambleAcc XXH3_scrambleAcc_neon | |
4329 | #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar | |
4330 | ||
4331 | #elif (XXH_VECTOR == XXH_VSX) | |
4332 | ||
4333 | #define XXH3_accumulate_512 XXH3_accumulate_512_vsx | |
4334 | #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx | |
4335 | #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar | |
4336 | ||
4337 | #else /* scalar */ | |
4338 | ||
4339 | #define XXH3_accumulate_512 XXH3_accumulate_512_scalar | |
4340 | #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar | |
4341 | #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar | |
4342 | ||
4343 | #endif | |
4344 | ||
4345 | ||
4346 | ||
4347 | #ifndef XXH_PREFETCH_DIST | |
4348 | # ifdef __clang__ | |
4349 | # define XXH_PREFETCH_DIST 320 | |
4350 | # else | |
4351 | # if (XXH_VECTOR == XXH_AVX512) | |
4352 | # define XXH_PREFETCH_DIST 512 | |
4353 | # else | |
4354 | # define XXH_PREFETCH_DIST 384 | |
4355 | # endif | |
4356 | # endif /* __clang__ */ | |
4357 | #endif /* XXH_PREFETCH_DIST */ | |
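/*
 * Because of the #ifndef guard above, the prefetch distance can be tuned at
 * build time, e.g. `-DXXH_PREFETCH_DIST=256` (an illustrative value, not a
 * recommendation; the best distance depends on the target core).
 */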
4358 | ||
4359 | /* | |
4360 | * XXH3_accumulate() | |
4361 | * Loops over XXH3_accumulate_512(). | |
4362 | * Assumption: nbStripes will not overflow the secret size | |
4363 | */ | |
4364 | XXH_FORCE_INLINE void | |
4365 | XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, | |
4366 | const xxh_u8* XXH_RESTRICT input, | |
4367 | const xxh_u8* XXH_RESTRICT secret, | |
4368 | size_t nbStripes, | |
4369 | XXH3_f_accumulate_512 f_acc512) | |
4370 | { | |
4371 | size_t n; | |
4372 | for (n = 0; n < nbStripes; n++ ) { | |
4373 | const xxh_u8* const in = input + n*XXH_STRIPE_LEN; | |
4374 | XXH_PREFETCH(in + XXH_PREFETCH_DIST); | |
4375 | f_acc512(acc, | |
4376 | in, | |
4377 | secret + n*XXH_SECRET_CONSUME_RATE); | |
4378 | } | |
4379 | } | |
4380 | ||
4381 | XXH_FORCE_INLINE void | |
4382 | XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, | |
4383 | const xxh_u8* XXH_RESTRICT input, size_t len, | |
4384 | const xxh_u8* XXH_RESTRICT secret, size_t secretSize, | |
4385 | XXH3_f_accumulate_512 f_acc512, | |
4386 | XXH3_f_scrambleAcc f_scramble) | |
4387 | { | |
4388 | size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; | |
4389 | size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; | |
4390 | size_t const nb_blocks = (len - 1) / block_len; | |
4391 | ||
4392 | size_t n; | |
4393 | ||
4394 | XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); | |
4395 | ||
4396 | for (n = 0; n < nb_blocks; n++) { | |
4397 | XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); | |
4398 | f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); | |
4399 | } | |
4400 | ||
4401 | /* last partial block */ | |
4402 | XXH_ASSERT(len > XXH_STRIPE_LEN); | |
4403 | { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; | |
4404 | XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); | |
4405 | XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); | |
4406 | ||
4407 | /* last stripe */ | |
4408 | { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; | |
4409 | #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ | |
4410 | f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); | |
4411 | } } | |
4412 | } | |
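/*
 * Concrete sizing with the default 192-byte secret (illustrative numbers):
 * nbStripesPerBlock = (192 - 64) / 8 = 16, so block_len = 64 * 16 = 1024 bytes.
 * A 10000-byte input thus runs 9 full blocks (9216 bytes), then a partial
 * block of 12 stripes, then the final stripe, which overlaps the last
 * 64 bytes of input.
 */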
4413 | ||
4414 | XXH_FORCE_INLINE xxh_u64 | |
4415 | XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) | |
4416 | { | |
4417 | return XXH3_mul128_fold64( | |
4418 | acc[0] ^ XXH_readLE64(secret), | |
4419 | acc[1] ^ XXH_readLE64(secret+8) ); | |
4420 | } | |
4421 | ||
4422 | static XXH64_hash_t | |
4423 | XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) | |
4424 | { | |
4425 | xxh_u64 result64 = start; | |
4426 | size_t i = 0; | |
4427 | ||
4428 | for (i = 0; i < 4; i++) { | |
4429 | result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); | |
4430 | #if defined(__clang__) /* Clang */ \ | |
4431 | && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ | |
4432 | && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ | |
4433 | && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ | |
4434 | /* | |
4435 | * UGLY HACK: | |
4436 | * Prevent autovectorization on Clang ARMv7-a. Exact same problem as | |
4437 | * the one in XXH3_len_129to240_64b. Speeds up keys just above 240 bytes. | |
4438 | * XXH3_64bits, len == 256, Snapdragon 835: | |
4439 | * without hack: 2063.7 MB/s | |
4440 | * with hack: 2560.7 MB/s | |
4441 | */ | |
4442 | XXH_COMPILER_GUARD(result64); | |
4443 | #endif | |
4444 | } | |
4445 | ||
4446 | return XXH3_avalanche(result64); | |
4447 | } | |
4448 | ||
4449 | #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ | |
4450 | XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } | |
4451 | ||
4452 | XXH_FORCE_INLINE XXH64_hash_t | |
4453 | XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, | |
4454 | const void* XXH_RESTRICT secret, size_t secretSize, | |
4455 | XXH3_f_accumulate_512 f_acc512, | |
4456 | XXH3_f_scrambleAcc f_scramble) | |
4457 | { | |
4458 | XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; | |
4459 | ||
4460 | XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); | |
4461 | ||
4462 | /* converge into final hash */ | |
4463 | XXH_STATIC_ASSERT(sizeof(acc) == 64); | |
4464 | /* do not align on 8, so that the secret is different from the accumulator */ | |
4465 | #define XXH_SECRET_MERGEACCS_START 11 | |
4466 | XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); | |
4467 | return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); | |
4468 | } | |
4469 | ||
4470 | /* | |
4471 | * It's important for performance to transmit secret's size (when it's static) | |
4472 | * so that the compiler can properly optimize the vectorized loop. | |
4473 | * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. | |
4474 | */ | |
4475 | XXH_FORCE_INLINE XXH64_hash_t | |
4476 | XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, | |
4477 | XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) | |
4478 | { | |
4479 | (void)seed64; | |
4480 | return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); | |
4481 | } | |
4482 | ||
4483 | /* | |
4484 | * It's preferable for performance that XXH3_hashLong is not inlined, | |
4485 | * as it results in a smaller function for small data, easier on the instruction cache. | |
4486 | * Note that inside this no_inline function, we do inline the internal loop, | |
4487 | * and provide a statically defined secret size to allow optimization of vector loop. | |
4488 | */ | |
4489 | XXH_NO_INLINE XXH64_hash_t | |
4490 | XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, | |
4491 | XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) | |
4492 | { | |
4493 | (void)seed64; (void)secret; (void)secretLen; | |
4494 | return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); | |
4495 | } | |
4496 | ||
4497 | /* | |
4498 | * XXH3_hashLong_64b_withSeed(): | |
4499 | * Generate a custom key based on alteration of default XXH3_kSecret with the seed, | |
4500 | * and then use this key for long mode hashing. | |
4501 | * | |
4502 | * This operation is decently fast but nonetheless costs a little bit of time. | |
4503 | * Try to avoid it whenever possible (typically when seed==0). | |
4504 | * | |
4505 | * It's important for performance that XXH3_hashLong is not inlined. Not sure | |
4506 | * why (uop cache maybe?), but the difference is large and easily measurable. | |
4507 | */ | |
4508 | XXH_FORCE_INLINE XXH64_hash_t | |
4509 | XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, | |
4510 | XXH64_hash_t seed, | |
4511 | XXH3_f_accumulate_512 f_acc512, | |
4512 | XXH3_f_scrambleAcc f_scramble, | |
4513 | XXH3_f_initCustomSecret f_initSec) | |
4514 | { | |
4515 | if (seed == 0) | |
4516 | return XXH3_hashLong_64b_internal(input, len, | |
4517 | XXH3_kSecret, sizeof(XXH3_kSecret), | |
4518 | f_acc512, f_scramble); | |
4519 | { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; | |
4520 | f_initSec(secret, seed); | |
4521 | return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), | |
4522 | f_acc512, f_scramble); | |
4523 | } | |
4524 | } | |
4525 | ||
4526 | /* | |
4527 | * It's important for performance that XXH3_hashLong is not inlined. | |
4528 | */ | |
4529 | XXH_NO_INLINE XXH64_hash_t | |
4530 | XXH3_hashLong_64b_withSeed(const void* input, size_t len, | |
4531 | XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) | |
4532 | { | |
4533 | (void)secret; (void)secretLen; | |
4534 | return XXH3_hashLong_64b_withSeed_internal(input, len, seed, | |
4535 | XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); | |
4536 | } | |
4537 | ||
4538 | ||
4539 | typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, | |
4540 | XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); | |
4541 | ||
4542 | XXH_FORCE_INLINE XXH64_hash_t | |
4543 | XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, | |
4544 | XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, | |
4545 | XXH3_hashLong64_f f_hashLong) | |
4546 | { | |
4547 | XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); | |
4548 | /* | |
4549 | * If an action must be taken when the `secretLen` condition is not respected, | |
4550 | * it should be done here. | |
4551 | * For now, it's a contract pre-condition. | |
4552 | * Adding a check and a branch here would cost performance at every hash. | |
4553 | * Also, note that the function signature doesn't offer room to return an error. | |
4554 | */ | |
4555 | if (len <= 16) | |
4556 | return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); | |
4557 | if (len <= 128) | |
4558 | return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); | |
4559 | if (len <= XXH3_MIDSIZE_MAX) | |
4560 | return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); | |
4561 | return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); | |
4562 | } | |
4563 | ||
4564 | ||
4565 | /* === Public entry point === */ | |
4566 | ||
4567 | /*! @ingroup xxh3_family */ | |
4568 | XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) | |
4569 | { | |
4570 | return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); | |
4571 | } | |
4572 | ||
4573 | /*! @ingroup xxh3_family */ | |
4574 | XXH_PUBLIC_API XXH64_hash_t | |
4575 | XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) | |
4576 | { | |
4577 | return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); | |
4578 | } | |
4579 | ||
4580 | /*! @ingroup xxh3_family */ | |
4581 | XXH_PUBLIC_API XXH64_hash_t | |
4582 | XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) | |
4583 | { | |
4584 | return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); | |
4585 | } | |
4586 | ||
4587 | XXH_PUBLIC_API XXH64_hash_t | |
4588 | XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) | |
4589 | { | |
4590 | if (len <= XXH3_MIDSIZE_MAX) | |
4591 | return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); | |
4592 | return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize); | |
4593 | } | |
4594 | ||
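/*
 * Illustrative usage of the one-shot 64-bit entry points above. This is a
 * minimal sketch, guarded out of compilation: the message and `kMySecret`
 * are placeholder assumptions (a real custom secret should be high-entropy).
 */
#if 0
#include <stdio.h>
#include <string.h>

static void XXH3_example_oneshot(void)
{
    const char msg[] = "xxHash example input";
    /* default secret, seed == 0 : fastest entry point */
    XXH64_hash_t const h1 = XXH3_64bits(msg, strlen(msg));
    /* seeded variant : derives a custom secret for long inputs */
    XXH64_hash_t const h2 = XXH3_64bits_withSeed(msg, strlen(msg), 0x9E3779B1U);
    /* custom secret : must be >= XXH3_SECRET_SIZE_MIN bytes */
    static const unsigned char kMySecret[XXH3_SECRET_SIZE_MIN] = { 0x5A };
    XXH64_hash_t const h3 = XXH3_64bits_withSecret(msg, strlen(msg),
                                                   kMySecret, sizeof(kMySecret));
    printf("%016llx %016llx %016llx\n",
           (unsigned long long)h1, (unsigned long long)h2, (unsigned long long)h3);
}
#endif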
4595 | ||
4596 | /* === XXH3 streaming === */ | |
4597 | ||
4598 | /* | |
4599 | * Allocates a pointer that is always aligned to `align`. | |
4600 | * | |
4601 | * This must be freed with `XXH_alignedFree()`. | |
4602 | * | |
4603 | * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte | |
4604 | * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2, | |
4605 | * nor, on 32-bit, for the 16 byte aligned loads in SSE2 and NEON. | |
4606 | * | |
4607 | * This underalignment previously caused a rather obvious crash which went | |
4608 | * completely unnoticed due to XXH3_createState() not actually being tested. | |
4609 | * Credit to RedSpah for noticing this bug. | |
4610 | * | |
4611 | * The alignment is done manually: functions like posix_memalign or _mm_malloc | |
4612 | * are avoided because, to maintain portability, we would have to write a fallback | |
4613 | * like this anyway, and besides, testing for the existence of library | |
4614 | * functions without relying on external build tools is impossible. | |
4615 | * | |
4616 | * The method is simple: Overallocate, manually align, and store the offset | |
4617 | * to the original behind the returned pointer. | |
4618 | * | |
4619 | * Align must be a power of 2 and 8 <= align <= 128. | |
4620 | */ | |
4621 | static void* XXH_alignedMalloc(size_t s, size_t align) | |
4622 | { | |
4623 | XXH_ASSERT(align <= 128 && align >= 8); /* range check */ | |
4624 | XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ | |
4625 | XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ | |
4626 | { /* Overallocate to make room for manual realignment and an offset byte */ | |
4627 | xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); | |
4628 | if (base != NULL) { | |
4629 | /* | |
4630 | * Get the offset needed to align this pointer. | |
4631 | * | |
4632 | * Even if base is already aligned, offset is never 0 (it equals align), | |
4633 | * so there is always at least one byte to store the offset to the original pointer. | |
4634 | */ | |
4635 | size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ | |
4636 | /* Add the offset for the now-aligned pointer */ | |
4637 | xxh_u8* ptr = base + offset; | |
4638 | ||
4639 | XXH_ASSERT((size_t)ptr % align == 0); | |
4640 | ||
4641 | /* Store the offset immediately before the returned pointer. */ | |
4642 | ptr[-1] = (xxh_u8)offset; | |
4643 | return ptr; | |
4644 | } | |
4645 | return NULL; | |
4646 | } | |
4647 | } | |
4648 | /* | |
4649 | * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass | |
4650 | * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. | |
4651 | */ | |
4652 | static void XXH_alignedFree(void* p) | |
4653 | { | |
4654 | if (p != NULL) { | |
4655 | xxh_u8* ptr = (xxh_u8*)p; | |
4656 | /* Get the offset byte we added in XXH_alignedMalloc. */ | |
4657 | xxh_u8 offset = ptr[-1]; | |
4658 | /* Free the original malloc'd pointer */ | |
4659 | xxh_u8* base = ptr - offset; | |
4660 | XXH_free(base); | |
4661 | } | |
4662 | } | |
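/*
 * A minimal sketch of the allocator contract above, using only the two
 * helpers defined in this file. The byte stored at ptr[-1] is what lets
 * XXH_alignedFree() recover the original malloc'd base:
 *
 *   base                      ptr = base + offset
 *    |------ offset bytes ------|------------ s bytes ------------|
 *                     ptr[-1] == offset  (1 <= offset <= align)
 */
#if 0
static void XXH3_example_alignedAlloc(void)
{
    void* const p = XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
    if (p != NULL) {
        /* ((size_t)p % 64) == 0 is guaranteed here */
        XXH_alignedFree(p);  /* never pass p to free() directly */
    }
}
#endif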
4663 | /*! @ingroup xxh3_family */ | |
4664 | XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) | |
4665 | { | |
4666 | XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); | |
4667 | if (state==NULL) return NULL; | |
4668 | XXH3_INITSTATE(state); | |
4669 | return state; | |
4670 | } | |
4671 | ||
4672 | /*! @ingroup xxh3_family */ | |
4673 | XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) | |
4674 | { | |
4675 | XXH_alignedFree(statePtr); | |
4676 | return XXH_OK; | |
4677 | } | |
4678 | ||
4679 | /*! @ingroup xxh3_family */ | |
4680 | XXH_PUBLIC_API void | |
4681 | XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) | |
4682 | { | |
4683 | XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); | |
4684 | } | |
4685 | ||
4686 | static void | |
4687 | XXH3_reset_internal(XXH3_state_t* statePtr, | |
4688 | XXH64_hash_t seed, | |
4689 | const void* secret, size_t secretSize) | |
4690 | { | |
4691 | size_t const initStart = offsetof(XXH3_state_t, bufferedSize); | |
4692 | size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; | |
4693 | XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); | |
4694 | XXH_ASSERT(statePtr != NULL); | |
4695 | /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ | |
4696 | memset((char*)statePtr + initStart, 0, initLength); | |
4697 | statePtr->acc[0] = XXH_PRIME32_3; | |
4698 | statePtr->acc[1] = XXH_PRIME64_1; | |
4699 | statePtr->acc[2] = XXH_PRIME64_2; | |
4700 | statePtr->acc[3] = XXH_PRIME64_3; | |
4701 | statePtr->acc[4] = XXH_PRIME64_4; | |
4702 | statePtr->acc[5] = XXH_PRIME32_2; | |
4703 | statePtr->acc[6] = XXH_PRIME64_5; | |
4704 | statePtr->acc[7] = XXH_PRIME32_1; | |
4705 | statePtr->seed = seed; | |
4706 | statePtr->useSeed = (seed != 0); | |
4707 | statePtr->extSecret = (const unsigned char*)secret; | |
4708 | XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); | |
4709 | statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; | |
4710 | statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; | |
4711 | } | |
4712 | ||
4713 | /*! @ingroup xxh3_family */ | |
4714 | XXH_PUBLIC_API XXH_errorcode | |
4715 | XXH3_64bits_reset(XXH3_state_t* statePtr) | |
4716 | { | |
4717 | if (statePtr == NULL) return XXH_ERROR; | |
4718 | XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); | |
4719 | return XXH_OK; | |
4720 | } | |
4721 | ||
4722 | /*! @ingroup xxh3_family */ | |
4723 | XXH_PUBLIC_API XXH_errorcode | |
4724 | XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) | |
4725 | { | |
4726 | if (statePtr == NULL) return XXH_ERROR; | |
4727 | if (secret == NULL) return XXH_ERROR; | |
4728 | if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; | |
4729 | XXH3_reset_internal(statePtr, 0, secret, secretSize); | |
4730 | return XXH_OK; | |
4731 | } | |
4732 | ||
4733 | /*! @ingroup xxh3_family */ | |
4734 | XXH_PUBLIC_API XXH_errorcode | |
4735 | XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) | |
4736 | { | |
4737 | if (statePtr == NULL) return XXH_ERROR; | |
4738 | if (seed==0) return XXH3_64bits_reset(statePtr); | |
4739 | if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) | |
4740 | XXH3_initCustomSecret(statePtr->customSecret, seed); | |
4741 | XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); | |
4742 | return XXH_OK; | |
4743 | } | |
4744 | ||
4745 | /*! @ingroup xxh3_family */ | |
4746 | XXH_PUBLIC_API XXH_errorcode | |
4747 | XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) | |
4748 | { | |
4749 | if (statePtr == NULL) return XXH_ERROR; | |
4750 | if (secret == NULL) return XXH_ERROR; | |
4751 | if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; | |
4752 | XXH3_reset_internal(statePtr, seed64, secret, secretSize); | |
4753 | statePtr->useSeed = 1; /* always, even if seed64==0 */ | |
4754 | return XXH_OK; | |
4755 | } | |
4756 | ||
4757 | /* Note : when XXH3_consumeStripes() is invoked, | |
4758 | * there must be a guarantee that at least one more byte will be consumed from input | |
4759 | * so that the function can blindly consume all stripes using the "normal" secret segment */ | |
4760 | XXH_FORCE_INLINE void | |
4761 | XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, | |
4762 | size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, | |
4763 | const xxh_u8* XXH_RESTRICT input, size_t nbStripes, | |
4764 | const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, | |
4765 | XXH3_f_accumulate_512 f_acc512, | |
4766 | XXH3_f_scrambleAcc f_scramble) | |
4767 | { | |
4768 | XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ | |
4769 | XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); | |
4770 | if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { | |
4771 | /* need a scrambling operation */ | |
4772 | size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; | |
4773 | size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; | |
4774 | XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); | |
4775 | f_scramble(acc, secret + secretLimit); | |
4776 | XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); | |
4777 | *nbStripesSoFarPtr = nbStripesAfterBlock; | |
4778 | } else { | |
4779 | XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); | |
4780 | *nbStripesSoFarPtr += nbStripes; | |
4781 | } | |
4782 | } | |
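/*
 * Worked example of the bookkeeping above (numbers are illustrative): with
 * nbStripesPerBlock == 16 and *nbStripesSoFarPtr == 14, a call passing
 * nbStripes == 5 consumes 2 stripes to finish the block, scrambles once,
 * consumes the remaining 3 stripes from the start of the secret, and leaves
 * *nbStripesSoFarPtr == 3.
 */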
4783 | ||
4784 | #ifndef XXH3_STREAM_USE_STACK | |
4785 | # ifndef __clang__ /* clang doesn't need additional stack space */ | |
4786 | # define XXH3_STREAM_USE_STACK 1 | |
4787 | # endif | |
4788 | #endif | |
4789 | /* | |
4790 | * Both XXH3_64bits_update and XXH3_128bits_update use this routine. | |
4791 | */ | |
4792 | XXH_FORCE_INLINE XXH_errorcode | |
4793 | XXH3_update(XXH3_state_t* XXH_RESTRICT const state, | |
4794 | const xxh_u8* XXH_RESTRICT input, size_t len, | |
4795 | XXH3_f_accumulate_512 f_acc512, | |
4796 | XXH3_f_scrambleAcc f_scramble) | |
4797 | { | |
4798 | if (input==NULL) { | |
4799 | XXH_ASSERT(len == 0); | |
4800 | return XXH_OK; | |
4801 | } | |
4802 | ||
4803 | XXH_ASSERT(state != NULL); | |
4804 | { const xxh_u8* const bEnd = input + len; | |
4805 | const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; | |
4806 | #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 | |
4807 | /* For some reason, gcc and MSVC seem to suffer greatly | |
4808 | * when operating on accumulators directly in state. | |
4809 | * Operating on stack space seems to enable proper optimization. | |
4810 | * clang, on the other hand, doesn't seem to need this trick */ | |
4811 | XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); | |
4812 | #else | |
4813 | xxh_u64* XXH_RESTRICT const acc = state->acc; | |
4814 | #endif | |
4815 | state->totalLen += len; | |
4816 | XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); | |
4817 | ||
4818 | /* small input : just fill in tmp buffer */ | |
4819 | if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { | |
4820 | XXH_memcpy(state->buffer + state->bufferedSize, input, len); | |
4821 | state->bufferedSize += (XXH32_hash_t)len; | |
4822 | return XXH_OK; | |
4823 | } | |
4824 | ||
4825 | /* total input is now > XXH3_INTERNALBUFFER_SIZE */ | |
4826 | #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) | |
4827 | XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ | |
4828 | ||
4829 | /* | |
4830 | * Internal buffer is partially filled (always, except at beginning) | |
4831 | * Complete it, then consume it. | |
4832 | */ | |
4833 | if (state->bufferedSize) { | |
4834 | size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; | |
4835 | XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); | |
4836 | input += loadSize; | |
4837 | XXH3_consumeStripes(acc, | |
4838 | &state->nbStripesSoFar, state->nbStripesPerBlock, | |
4839 | state->buffer, XXH3_INTERNALBUFFER_STRIPES, | |
4840 | secret, state->secretLimit, | |
4841 | f_acc512, f_scramble); | |
4842 | state->bufferedSize = 0; | |
4843 | } | |
4844 | XXH_ASSERT(input < bEnd); | |
4845 | ||
4846 | /* large input to consume : ingest per full block */ | |
4847 | if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { | |
4848 | size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; | |
4849 | XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); | |
4850 | /* join to current block's end */ | |
4851 | { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; | |
4852 | XXH_ASSERT(nbStripesToEnd <= nbStripes); | |
4853 | XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512); | |
4854 | f_scramble(acc, secret + state->secretLimit); | |
4855 | state->nbStripesSoFar = 0; | |
4856 | input += nbStripesToEnd * XXH_STRIPE_LEN; | |
4857 | nbStripes -= nbStripesToEnd; | |
4858 | } | |
4859 | /* consume per entire blocks */ | |
4860 | while(nbStripes >= state->nbStripesPerBlock) { | |
4861 | XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512); | |
4862 | f_scramble(acc, secret + state->secretLimit); | |
4863 | input += state->nbStripesPerBlock * XXH_STRIPE_LEN; | |
4864 | nbStripes -= state->nbStripesPerBlock; | |
4865 | } | |
4866 | /* consume last partial block */ | |
4867 | XXH3_accumulate(acc, input, secret, nbStripes, f_acc512); | |
4868 | input += nbStripes * XXH_STRIPE_LEN; | |
4869 | XXH_ASSERT(input < bEnd); /* at least some bytes left */ | |
4870 | state->nbStripesSoFar = nbStripes; | |
4871 | /* buffer predecessor of last partial stripe */ | |
4872 | XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); | |
4873 | XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); | |
4874 | } else { | |
4875 | /* content to consume <= block size */ | |
4876 | /* Consume input by a multiple of internal buffer size */ | |
4877 | if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { | |
4878 | const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; | |
4879 | do { | |
4880 | XXH3_consumeStripes(acc, | |
4881 | &state->nbStripesSoFar, state->nbStripesPerBlock, | |
4882 | input, XXH3_INTERNALBUFFER_STRIPES, | |
4883 | secret, state->secretLimit, | |
4884 | f_acc512, f_scramble); | |
4885 | input += XXH3_INTERNALBUFFER_SIZE; | |
4886 | } while (input<limit); | |
4887 | /* buffer predecessor of last partial stripe */ | |
4888 | XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); | |
4889 | } | |
4890 | } | |
4891 | ||
4892 | /* Some remaining input (always) : buffer it */ | |
4893 | XXH_ASSERT(input < bEnd); | |
4894 | XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); | |
4895 | XXH_ASSERT(state->bufferedSize == 0); | |
4896 | XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); | |
4897 | state->bufferedSize = (XXH32_hash_t)(bEnd-input); | |
4898 | #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 | |
4899 | /* save stack accumulators into state */ | |
4900 | memcpy(state->acc, acc, sizeof(acc)); | |
4901 | #endif | |
4902 | } | |
4903 | ||
4904 | return XXH_OK; | |
4905 | } | |
4906 | ||
4907 | /*! @ingroup xxh3_family */ | |
4908 | XXH_PUBLIC_API XXH_errorcode | |
4909 | XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) | |
4910 | { | |
4911 | return XXH3_update(state, (const xxh_u8*)input, len, | |
4912 | XXH3_accumulate_512, XXH3_scrambleAcc); | |
4913 | } | |
4914 | ||
4915 | ||
4916 | XXH_FORCE_INLINE void | |
4917 | XXH3_digest_long (XXH64_hash_t* acc, | |
4918 | const XXH3_state_t* state, | |
4919 | const unsigned char* secret) | |
4920 | { | |
4921 | /* | |
4922 | * Digest on a local copy. This way, the state remains unaltered, and it can | |
4923 | * continue ingesting more input afterwards. | |
4924 | */ | |
4925 | XXH_memcpy(acc, state->acc, sizeof(state->acc)); | |
4926 | if (state->bufferedSize >= XXH_STRIPE_LEN) { | |
4927 | size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; | |
4928 | size_t nbStripesSoFar = state->nbStripesSoFar; | |
4929 | XXH3_consumeStripes(acc, | |
4930 | &nbStripesSoFar, state->nbStripesPerBlock, | |
4931 | state->buffer, nbStripes, | |
4932 | secret, state->secretLimit, | |
4933 | XXH3_accumulate_512, XXH3_scrambleAcc); | |
4934 | /* last stripe */ | |
4935 | XXH3_accumulate_512(acc, | |
4936 | state->buffer + state->bufferedSize - XXH_STRIPE_LEN, | |
4937 | secret + state->secretLimit - XXH_SECRET_LASTACC_START); | |
4938 | } else { /* bufferedSize < XXH_STRIPE_LEN */ | |
4939 | xxh_u8 lastStripe[XXH_STRIPE_LEN]; | |
4940 | size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; | |
4941 | XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ | |
4942 | XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); | |
4943 | XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); | |
4944 | XXH3_accumulate_512(acc, | |
4945 | lastStripe, | |
4946 | secret + state->secretLimit - XXH_SECRET_LASTACC_START); | |
4947 | } | |
4948 | } | |
4949 | ||
4950 | /*! @ingroup xxh3_family */ | |
4951 | XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) | |
4952 | { | |
4953 | const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; | |
4954 | if (state->totalLen > XXH3_MIDSIZE_MAX) { | |
4955 | XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; | |
4956 | XXH3_digest_long(acc, state, secret); | |
4957 | return XXH3_mergeAccs(acc, | |
4958 | secret + XXH_SECRET_MERGEACCS_START, | |
4959 | (xxh_u64)state->totalLen * XXH_PRIME64_1); | |
4960 | } | |
4961 | /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ | |
4962 | if (state->useSeed) | |
4963 | return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); | |
4964 | return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), | |
4965 | secret, state->secretLimit + XXH_STRIPE_LEN); | |
4966 | } | |
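/*
 * Illustrative streaming usage of the reset/update/digest API above: hashing
 * in arbitrary chunks yields the same result as a one-shot call. A minimal
 * sketch; the 4096-byte chunk size is an arbitrary assumption.
 */
#if 0
static XXH64_hash_t XXH3_example_stream(const void* data, size_t len)
{
    XXH64_hash_t h = 0;
    XXH3_state_t* const st = XXH3_createState();
    if (st == NULL) return 0;  /* allocation failure */
    if (XXH3_64bits_reset(st) == XXH_OK) {
        const unsigned char* p = (const unsigned char*)data;
        size_t const chunk = 4096;  /* any chunking gives the same hash */
        while (len > chunk) { XXH3_64bits_update(st, p, chunk); p += chunk; len -= chunk; }
        XXH3_64bits_update(st, p, len);
        h = XXH3_64bits_digest(st);  /* state remains valid for more updates */
    }
    XXH3_freeState(st);
    return h;
}
#endif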
4967 | ||
4968 | ||
4969 | ||
4970 | /* ========================================== | |
4971 | * XXH3 128 bits (a.k.a. XXH128) | |
4972 | * ========================================== | |
4973 | * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, | |
4974 | * even without counting the significantly larger output size. | |
4975 | * | |
4976 | * For example, extra steps are taken to avoid the seed-dependent collisions | |
4977 | * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). | |
4978 | * | |
4979 | * This strength naturally comes at the cost of some speed, especially on short | |
4980 | * lengths. Note that long inputs hash about as fast as with the 64-bit version, | |
4981 | * since the long-input path is only a slight modification of the 64-bit loop. | |
4982 | * | |
4983 | * XXH128 is also more oriented towards 64-bit machines. It is still extremely | |
4984 | * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). | |
4985 | */ | |
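/*
 * For reference, a hedged sketch of consuming the 128-bit result (the public
 * prototypes are declared earlier in this header; the implementation follows
 * below). XXH128_hash_t is a plain struct of two 64-bit halves.
 */
#if 0
static int XXH3_example_128(const void* data, size_t len)
{
    XXH128_hash_t const h = XXH3_128bits(data, len);
    /* both halves together form the 128-bit hash */
    return (h.low64 != 0) || (h.high64 != 0);
}
#endif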
4986 | ||
4987 | XXH_FORCE_INLINE XXH128_hash_t | |
4988 | XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
4989 | { | |
4990 | /* A doubled version of 1to3_64b with different constants. */ | |
4991 | XXH_ASSERT(input != NULL); | |
4992 | XXH_ASSERT(1 <= len && len <= 3); | |
4993 | XXH_ASSERT(secret != NULL); | |
4994 | /* | |
4995 | * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } | |
4996 | * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } | |
4997 | * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } | |
4998 | */ | |
4999 | { xxh_u8 const c1 = input[0]; | |
5000 | xxh_u8 const c2 = input[len >> 1]; | |
5001 | xxh_u8 const c3 = input[len - 1]; | |
5002 | xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) | |
5003 | | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); | |
5004 | xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); | |
5005 | xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; | |
5006 | xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; | |
5007 | xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; | |
5008 | xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; | |
5009 | XXH128_hash_t h128; | |
5010 | h128.low64 = XXH64_avalanche(keyed_lo); | |
5011 | h128.high64 = XXH64_avalanche(keyed_hi); | |
5012 | return h128; | |
5013 | } | |
5014 | } | |
5015 | ||
5016 | XXH_FORCE_INLINE XXH128_hash_t | |
5017 | XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
5018 | { | |
5019 | XXH_ASSERT(input != NULL); | |
5020 | XXH_ASSERT(secret != NULL); | |
5021 | XXH_ASSERT(4 <= len && len <= 8); | |
5022 | seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; | |
5023 | { xxh_u32 const input_lo = XXH_readLE32(input); | |
5024 | xxh_u32 const input_hi = XXH_readLE32(input + len - 4); | |
5025 | xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); | |
5026 | xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; | |
5027 | xxh_u64 const keyed = input_64 ^ bitflip; | |
5028 | ||
5029 | /* Shift len left so the addend is even: the multiplier stays odd, avoiding a multiply by an even number. */ | |
5030 | XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); | |
5031 | ||
5032 | m128.high64 += (m128.low64 << 1); | |
5033 | m128.low64 ^= (m128.high64 >> 3); | |
5034 | ||
5035 | m128.low64 = XXH_xorshift64(m128.low64, 35); | |
5036 | m128.low64 *= 0x9FB21C651E98DF25ULL; | |
5037 | m128.low64 = XXH_xorshift64(m128.low64, 28); | |
5038 | m128.high64 = XXH3_avalanche(m128.high64); | |
5039 | return m128; | |
5040 | } | |
5041 | } | |
5042 | ||
5043 | XXH_FORCE_INLINE XXH128_hash_t | |
5044 | XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
5045 | { | |
5046 | XXH_ASSERT(input != NULL); | |
5047 | XXH_ASSERT(secret != NULL); | |
5048 | XXH_ASSERT(9 <= len && len <= 16); | |
5049 | { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; | |
5050 | xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; | |
5051 | xxh_u64 const input_lo = XXH_readLE64(input); | |
5052 | xxh_u64 input_hi = XXH_readLE64(input + len - 8); | |
5053 | XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); | |
5054 | /* | |
5055 | * Put len in the middle of m128 to ensure that the length gets mixed to | |
5056 | * both the low and high bits in the 128x64 multiply below. | |
5057 | */ | |
5058 | m128.low64 += (xxh_u64)(len - 1) << 54; | |
5059 | input_hi ^= bitfliph; | |
5060 | /* | |
5061 | * Add the high 32 bits of input_hi to the high 32 bits of m128, then | |
5062 | * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to | |
5063 | * the high 64 bits of m128. | |
5064 | * | |
5065 | * The best approach to this operation is different on 32-bit and 64-bit. | |
5066 | */ | |
5067 | if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ | |
5068 | /* | |
5069 | * 32-bit optimized version, which is more readable. | |
5070 | * | |
5071 | * On 32-bit, it removes an ADC and delays a dependency between the two | |
5072 | * halves of m128.high64, but it generates an extra mask on 64-bit. | |
5073 | */ | |
5074 | m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); | |
5075 | } else { | |
5076 | /* | |
5077 | * 64-bit optimized (albeit more confusing) version. | |
5078 | * | |
5079 | * Uses some properties of addition and multiplication to remove the mask: | |
5080 | * | |
5081 | * Let: | |
5082 | * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) | |
5083 | * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) | |
5084 | * c = XXH_PRIME32_2 | |
5085 | * | |
5086 | * a + (b * c) | |
5087 | * Inverse Property: x + y - x == y | |
5088 | * a + (b * (1 + c - 1)) | |
5089 | * Distributive Property: x * (y + z) == (x * y) + (x * z) | |
5090 | * a + (b * 1) + (b * (c - 1)) | |
5091 | * Identity Property: x * 1 == x | |
5092 | * a + b + (b * (c - 1)) | |
5093 | * | |
5094 | * Substitute a, b, and c: | |
5095 | * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) | |
5096 | * | |
5097 | * Since input_hi.hi + input_hi.lo == input_hi, we get this: | |
5098 | * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) | |
5099 | */ | |
5100 | m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); | |
5101 | } | |
5102 | /* m128 ^= XXH_swap64(m128 >> 64); */ | |
5103 | m128.low64 ^= XXH_swap64(m128.high64); | |
5104 | ||
5105 | { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ | |
5106 | XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); | |
5107 | h128.high64 += m128.high64 * XXH_PRIME64_2; | |
5108 | ||
5109 | h128.low64 = XXH3_avalanche(h128.low64); | |
5110 | h128.high64 = XXH3_avalanche(h128.high64); | |
5111 | return h128; | |
5112 | } } | |
5113 | } | |
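/*
 * A self-contained check of the algebraic identity used in the 64-bit branch
 * above (illustrative only): with lo/hi the low/high 32-bit halves of
 * input_hi, hi + lo * c == input_hi + lo * (c - 1) for c = XXH_PRIME32_2,
 * since input_hi == hi + lo. All arithmetic wraps mod 2^64, which preserves
 * the identity.
 */
#if 0
#include <assert.h>
static void XXH3_check_9to16_identity(xxh_u64 input_hi)
{
    xxh_u64 const lo = input_hi & 0xFFFFFFFFULL;
    xxh_u64 const hi = input_hi & 0xFFFFFFFF00000000ULL;
    xxh_u64 const with_mask    = hi + lo * (xxh_u64)XXH_PRIME32_2;
    xxh_u64 const without_mask = input_hi + lo * (xxh_u64)(XXH_PRIME32_2 - 1);
    assert(with_mask == without_mask);
}
#endif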

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64  = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche(seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc,
                                input  + (32 * i),
                                input  + (32 * i) + 16,
                                secret + (32 * i),
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc,
                                input + (32 * i),
                                input + (32 * i) + 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            0ULL - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate_512 f_acc512,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   XXH128_hash_t h128;
        h128.low64  = XXH3_mergeAccs(acc,
                                     secret + XXH_SECRET_MERGEACCS_START,
                                     (xxh_u64)len * XXH_PRIME64_1);
        h128.high64 = XXH3_mergeAccs(acc,
                                     secret + secretSize
                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                     ~((xxh_u64)len * XXH_PRIME64_2));
        return h128;
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*
 * It's important for performance to pass @secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
                                     XXH3_f_accumulate_512 f_acc512,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc512, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc512, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                                                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}
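/*
 * Illustrative sketch (not part of the library): the dispatcher above selects
 * one of four size classes (<= 16, <= 128, <= 240 i.e. XXH3_MIDSIZE_MAX, and
 * the long-input path), while the public API stays uniform. The buffer
 * contents and lengths below are arbitrary sample values.
 *
 * @code{.c}
 * #include <stdio.h>
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     unsigned char buf[241];
 *     size_t const lens[] = { 16, 128, 240, 241 };  // one length per size class
 *     size_t i;
 *     memset(buf, 0x55, sizeof(buf));
 *     for (i = 0; i < 4; i++) {
 *         XXH128_hash_t const h = XXH3_128bits(buf, lens[i]);
 *         printf("len %3u -> %016llx%016llx\n", (unsigned)lens[i],
 *                (unsigned long long)h.high64, (unsigned long long)h.low64);
 *     }
 *     return 0;
 * }
 * @endcode
 */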


/* === Public XXH128 API === */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}
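/*
 * Illustrative usage sketch (not part of the library): one-shot 128-bit
 * hashing with XXH3_128bits(). Assumes xxhash.h is included, with
 * XXH_IMPLEMENTATION defined in exactly one translation unit; the input
 * string is arbitrary sample data.
 *
 * @code{.c}
 * #include <stdio.h>
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     const char* msg = "example input";
 *     XXH128_hash_t const h = XXH3_128bits(msg, strlen(msg));
 *     printf("%016llx%016llx\n",
 *            (unsigned long long)h.high64,
 *            (unsigned long long)h.low64);
 *     return 0;
 * }
 * @endcode
 */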

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}
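/*
 * Illustrative sketch (not part of the library): seeded one-shot hashing via
 * the XXH128() convenience wrapper. The seed and the input bytes are
 * arbitrary sample values chosen here for demonstration.
 *
 * @code{.c}
 * #include <stdio.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     static const unsigned char data[] = { 0xDE, 0xAD, 0xBE, 0xEF };
 *     XXH64_hash_t const seed = 0x9E3779B185EBCA87ULL;  // arbitrary sample seed
 *     XXH128_hash_t const h = XXH128(data, sizeof(data), seed);
 *     // hashing the same input with a different seed yields an unrelated result
 *     printf("%016llx%016llx\n",
 *            (unsigned long long)h.high64, (unsigned long long)h.low64);
 *     return 0;
 * }
 * @endcode
 */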


/* === XXH3 128-bit streaming === */

/*
 * All initialization and update functions are identical to the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    return XXH3_64bits_reset(statePtr);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSeed(statePtr, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   XXH128_hash_t h128;
            h128.low64  = XXH3_mergeAccs(acc,
                                         secret + XXH_SECRET_MERGEACCS_START,
                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
            h128.high64 = XXH3_mergeAccs(acc,
                                         secret + state->secretLimit + XXH_STRIPE_LEN
                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}
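/*
 * Illustrative sketch (not part of the library): feeding a buffer in chunks
 * through the 128-bit streaming state. The chunk size and input are arbitrary
 * sample values; error handling is reduced to early exits for brevity.
 *
 * @code{.c}
 * #include <stdio.h>
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     const char* text = "a message delivered in several pieces";
 *     size_t const total = strlen(text);
 *     size_t const chunk = 8;  // arbitrary chunk size
 *     size_t pos = 0;
 *     XXH128_hash_t h;
 *
 *     XXH3_state_t* const state = XXH3_createState();
 *     if (state == NULL) return 1;
 *     if (XXH3_128bits_reset(state) == XXH_ERROR) { XXH3_freeState(state); return 1; }
 *     while (pos < total) {
 *         size_t const n = (total - pos < chunk) ? total - pos : chunk;
 *         if (XXH3_128bits_update(state, text + pos, n) == XXH_ERROR) break;
 *         pos += n;
 *     }
 *     h = XXH3_128bits_digest(state);  // same result as XXH3_128bits(text, total)
 *     XXH3_freeState(state);
 *     printf("%016llx%016llx\n",
 *            (unsigned long long)h.high64, (unsigned long long)h.low64);
 *     return 0;
 * }
 * @endcode
 */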

/* 128-bit utility functions */

#include <string.h>   /* memcmp, memcpy */

/* return : 1 if equal, 0 if different */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2 */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : bets that, in most cases, hash values are different,
     * so the high words usually decide the comparison */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
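/*
 * Illustrative sketch (not part of the library): sorting an array of
 * XXH128_hash_t values with stdlib's qsort() and XXH128_cmp(). The input
 * strings are arbitrary sample data.
 *
 * @code{.c}
 * #include <stdlib.h>
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     const char* inputs[] = { "alpha", "beta", "gamma" };
 *     XXH128_hash_t hashes[3];
 *     size_t i;
 *     for (i = 0; i < 3; i++)
 *         hashes[i] = XXH3_128bits(inputs[i], strlen(inputs[i]));
 *     qsort(hashes, 3, sizeof(hashes[0]), XXH128_cmp);  // total order: high64, then low64
 *     return 0;
 * }
 * @endcode
 */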


/*====== Canonical representation ======*/
/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
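/*
 * Illustrative sketch (not part of the library): the canonical form is a
 * fixed big-endian byte layout suitable for storage or transmission, and
 * converting back yields the original hash on any platform. The input is
 * arbitrary sample data.
 *
 * @code{.c}
 * #include <assert.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     XXH128_hash_t const h = XXH3_128bits("abc", 3);
 *     XXH128_canonical_t canon;             // 16 portable big-endian bytes
 *     XXH128_canonicalFromHash(&canon, h);
 *     {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *         assert(XXH128_isEqual(h, back));  // round-trip is lossless
 *     }
 *     return 0;
 * }
 * @endcode
 */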



/* ==========================================
 * Secret generators
 * ==========================================
 */
#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

static void XXH3_combine16(void* dst, XXH128_hash_t h128)
{
    XXH_writeLE64( dst,          XXH_readLE64(dst)          ^ h128.low64 );
    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
{
    XXH_ASSERT(secretBuffer != NULL);
    if (secretBuffer == NULL) return XXH_ERROR;
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    if (customSeedSize == 0) {
        customSeed = XXH3_kSecret;
        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
    }
    XXH_ASSERT(customSeed != NULL);
    if (customSeed == NULL) return XXH_ERROR;

    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
    {   size_t pos = 0;
        while (pos < secretSize) {
            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
            pos += toCopy;
    }   }

    {   size_t const nbSeg16 = secretSize / 16;
        size_t n;
        XXH128_canonical_t scrambler;
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
        for (n=0; n<nbSeg16; n++) {
            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
            XXH3_combine16((char*)secretBuffer + n*16, h128);
        }
        /* last segment */
        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
    }
    return XXH_OK;
}
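/*
 * Illustrative sketch (not part of the library): deriving a custom secret
 * from low-entropy seed material, then hashing with it. The seed string and
 * payload are arbitrary sample values; the buffer must be at least
 * XXH3_SECRET_SIZE_MIN bytes. Depending on the library version, the secret
 * generators may require XXH_STATIC_LINKING_ONLY.
 *
 * @code{.c}
 * #include <stdio.h>
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     unsigned char secret[XXH3_SECRET_SIZE_MIN];
 *     const char* seedMaterial = "application-specific seed material";
 *     XXH128_hash_t h;
 *     if (XXH3_generateSecret(secret, sizeof(secret),
 *                             seedMaterial, strlen(seedMaterial)) != XXH_OK)
 *         return 1;
 *     h = XXH3_128bits_withSecret("payload", 7, secret, sizeof(secret));
 *     printf("%016llx%016llx\n",
 *            (unsigned long long)h.high64, (unsigned long long)h.low64);
 *     return 0;
 * }
 * @endcode
 */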

/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
{
    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    XXH3_initCustomSecret(secret, seed);
    XXH_ASSERT(secretBuffer != NULL);
    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
}
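/*
 * Illustrative sketch (not part of the library): XXH3_generateSecret_fromSeed()
 * pairs with the *_withSecretandSeed() variants, so short inputs take the seed
 * path while long inputs use the pre-expanded secret, matching plain seeded
 * hashing. The seed is an arbitrary sample value; the buffer size 192 is the
 * current XXH_SECRET_DEFAULT_SIZE, hard-coded here for self-containment.
 *
 * @code{.c}
 * #include <string.h>
 * #include "xxhash.h"
 *
 * int main(void)
 * {
 *     unsigned char secret[192];     // XXH_SECRET_DEFAULT_SIZE bytes
 *     XXH64_hash_t const seed = 42;  // arbitrary sample seed
 *     const char* msg = "short input";
 *     XXH128_hash_t a, b;
 *     XXH3_generateSecret_fromSeed(secret, seed);
 *     a = XXH3_128bits_withSecretandSeed(msg, strlen(msg), secret, sizeof(secret), seed);
 *     b = XXH3_128bits_withSeed(msg, strlen(msg), seed);
 *     return XXH128_isEqual(a, b) ? 0 : 1;  // expected: equal
 * }
 * @endcode
 */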



/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif

#endif  /* XXH_NO_LONG_LONG */

#endif  /* XXH_NO_XXH3 */

/*!
 * @}
 */
#endif  /* XXH_IMPLEMENTATION */


#if defined (__cplusplus)
}
#endif