* src/blake2/blake2-impl.h: Sync load16() implementation;
this does not change the generated code.
Also leverage (builtin) memcpy to move data more efficiently
on little-endian systems,
giving a 2% win with GCC 9.2.1 on an i3-2310M.
#ifndef BLAKE2_IMPL_H
#define BLAKE2_IMPL_H
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#ifndef WORDS_BIGENDIAN
+# define NATIVE_LITTLE_ENDIAN 1
+#endif
+
#include <stdint.h>
#include <string.h>
return w;
#else
const uint8_t *p = ( const uint8_t * )src;
- return (( uint16_t )( p[0] ) << 0) |
- (( uint16_t )( p[1] ) << 8) ;
+ return ( uint16_t )((( uint32_t )( p[0] ) << 0) |
+ (( uint32_t )( p[1] ) << 8));
#endif
}