From 2f98abbcb6cfd6ffcf45d5587286f1f849184594 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 14 Dec 2002 20:42:05 +0000 Subject: [PATCH] x86_64 performance patch. --- Configure | 2 +- crypto/bn/Makefile.ssl | 2 + crypto/bn/asm/x86_64-gcc.c | 575 +++++++++++++++++++++++++++++++++++++ crypto/bn/bn_div.c | 18 +- crypto/bn/bn_lcl.h | 17 +- crypto/des/des_locl.h | 10 + crypto/md32_common.h | 29 +- crypto/md4/md4_dgst.c | 36 +-- crypto/md5/md5_dgst.c | 36 +-- crypto/rc5/rc5_locl.h | 17 ++ crypto/ripemd/rmd_dgst.c | 12 +- crypto/sha/sha_locl.h | 12 +- 12 files changed, 685 insertions(+), 81 deletions(-) create mode 100644 crypto/bn/asm/x86_64-gcc.c diff --git a/Configure b/Configure index eb7b588898..1e90ccd48d 100755 --- a/Configure +++ b/Configure @@ -391,7 +391,7 @@ my %table=( "linux-s390", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR:asm/ia64.o:::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-x86_64", "gcc:-DL_ENDIAN -DNO_ASM ::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR BF_PTR2 DES_INT DES_UNROLL:asm/x86_64-gcc.o:::::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "NetBSD-sparc", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -mv8 -Wall -DB_ENDIAN::(unknown):::BN_LLONG MD2_CHAR RC4_INDEX DES_UNROLL::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "NetBSD-m68", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall -DB_ENDIAN::(unknown):::BN_LLONG MD2_CHAR RC4_INDEX DES_UNROLL::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "NetBSD-x86", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -m486 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl index 9516ba5ab8..b709c84adc 100644 --- a/crypto/bn/Makefile.ssl +++ b/crypto/bn/Makefile.ssl @@ -138,6 +138,8 @@ asm/ia64-cpp.o: asm/ia64.S $(CC) $(ASFLAGS) -c -o asm/ia64-cpp.o /tmp/ia64.$$$$.s; \ rm -f /tmp/ia64.$$$$.s +asm/x86_64-gcc.o: asm/x86_64-gcc.c + files: $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c new file mode 100644 index 0000000000..b97b394661 --- /dev/null +++ b/crypto/bn/asm/x86_64-gcc.c @@ -0,0 +1,575 @@ +/* + * x86_64 BIGNUM accelerator version 0.1, December 2002. + * + * Implemented by Andy Polyakov for the OpenSSL + * project. + * + * Rights for redistribution and usage in source and binary forms are + * granted according to the OpenSSL license. Warranty of any kind is + * disclaimed. + * + * Q. Version 0.1? It doesn't sound like Andy, he used to assign real + * versions, like 1.0... + * A. Well, that's because this code is basically a quick-n-dirty + * proof-of-concept hack. As you can see it's implemented with + * inline assembler, which means that you're bound to GCC and that + * there must be a room for fine-tuning. + * + * Q. Why inline assembler? + * A. x86_64 features own ABI I'm not familiar with. Which is why + * I decided to let the compiler take care of subroutine + * prologue/epilogue as well as register allocation. + * + * Q. How much faster does it get? + * A. Unfortunately people sitting on x86_64 hardware are prohibited + * to disclose the performance numbers, so they (SuSE labs to be + * specific) wouldn't tell me. However! Very similar coding technique + * (reaching out for 128-bit result from 64x64-bit multiplication) + * results in >3 times performance improvement on MIPS and I see no + * reason why gain on x86_64 would be so much different:-) + */ + +#define BN_ULONG unsigned long + +/* + * "m"(a), "+m"(r) is the way to favor DirectPath ยต-code; + * "g"(0) let the compiler to decide where does it + * want to keep the value of zero; + */ +#define mul_add(r,a,word,carry) do { \ + register BN_ULONG high,low; \ + asm ("mulq %3" \ + : "=a"(low),"=d"(high) \ + : "a"(word),"m"(a) \ + : "cc"); \ + asm ("addq %2,%0; adcq %3,%1" \ + : "+r"(carry),"+d"(high)\ + : "a"(low),"g"(0) \ + : "cc"); \ + asm ("addq %2,%0; adcq %3,%1" \ + : "+m"(r),"+d"(high) \ + : "r"(carry),"g"(0) \ + : "cc"); \ + carry=high; \ + } while (0) + +#define mul(r,a,word,carry) do { \ + register BN_ULONG high,low; \ + asm ("mulq %3" \ + : "=a"(low),"=d"(high) \ + : "a"(word),"g"(a) \ + : "cc"); \ + asm ("addq %2,%0; adcq %3,%1" \ + : "+r"(carry),"+d"(high)\ + : "a"(low),"g"(0) \ + : "cc"); \ + (r)=carry, carry=high; \ + } while (0) + +#define sqr(r0,r1,a) \ + asm ("mulq %2" \ + : "=a"(r0),"=d"(r1) \ + : "a"(a) \ + : "cc"); + +BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) + { + BN_ULONG c1=0; + + if (num <= 0) return(c1); + + while (num&~3) + { + mul_add(rp[0],ap[0],w,c1); + mul_add(rp[1],ap[1],w,c1); + mul_add(rp[2],ap[2],w,c1); + mul_add(rp[3],ap[3],w,c1); + ap+=4; rp+=4; num-=4; + } + if (num) + { + mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; + mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; + mul_add(rp[2],ap[2],w,c1); return c1; + } + + return(c1); + } + +BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) + { + BN_ULONG c1=0; + + if (num <= 0) return(c1); + + while (num&~3) + { + mul(rp[0],ap[0],w,c1); + mul(rp[1],ap[1],w,c1); + mul(rp[2],ap[2],w,c1); + mul(rp[3],ap[3],w,c1); + ap+=4; rp+=4; num-=4; + } + if (num) + { + mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; + mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; + mul(rp[2],ap[2],w,c1); + } + return(c1); + } + +void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) + { + if (n <= 0) return; + + while (n&~3) + { + sqr(r[0],r[1],a[0]); + sqr(r[2],r[3],a[1]); + sqr(r[4],r[5],a[2]); + sqr(r[6],r[7],a[3]); + a+=4; r+=8; n-=4; + } + if (n) + { + sqr(r[0],r[1],a[0]); if (--n == 0) return; + sqr(r[2],r[3],a[1]); if (--n == 0) return; + sqr(r[4],r[5],a[2]); + } + } + +BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) +{ BN_ULONG ret,waste; + + asm ("divq %3" + : "=a"(ret),"=d"(waste) + : "a"(l),"d"(h),"g"(d) + : "cc"); + + return ret; +} + +BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +{ BN_ULONG ret,i; + + if (n <= 0) return 0; + + asm ( + " subq %2,%2 \n" + ".align 16 \n" + "1: movq (%4,%2,8),%0 \n" + " adcq (%5,%2,8),%0 \n" + " movq %0,(%3,%2,8) \n" + " leaq 1(%2),%2 \n" + " loop 1b \n" + " sbbq %0,%0 \n" + : "+a"(ret),"+c"(n),"+r"(i) + : "r"(rp),"r"(ap),"r"(bp) + : "cc" + ); + + return ret&1; +} + +#ifndef SIMICS +BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +{ BN_ULONG ret,i; + + if (n <= 0) return 0; + + asm ( + " subq %2,%2 \n" + ".align 16 \n" + "1: movq (%4,%2,8),%0 \n" + " sbbq (%5,%2,8),%0 \n" + " movq %0,(%3,%2,8) \n" + " leaq 1(%2),%2 \n" + " loop 1b \n" + " sbbq %0,%0 \n" + : "+a"(ret),"+c"(n),"+r"(i) + : "r"(rp),"r"(ap),"r"(bp) + : "cc" + ); + + return ret&1; +} +#else +/* Simics 1.4<7 has buggy sbbq:-( */ +#define BN_MASK2 0xffffffffffffffffL +BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) + { + BN_ULONG t1,t2; + int c=0; + + if (n <= 0) return((BN_ULONG)0); + + for (;;) + { + t1=a[0]; t2=b[0]; + r[0]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[1]; t2=b[1]; + r[1]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[2]; t2=b[2]; + r[2]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[3]; t2=b[3]; + r[3]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + a+=4; + b+=4; + r+=4; + } + return(c); + } +#endif + +/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ +/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ +/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ +/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ + +#if 0 +/* original macros are kept for reference purposes */ +#define mul_add_c(a,b,c0,c1,c2) { \ + BN_ULONG ta=(a),tb=(b); \ + t1 = ta * tb; \ + t2 = BN_UMULT_HIGH(ta,tb); \ + c0 += t1; t2 += (c0 + */ +# define bn_div_words(n0,n1,d0) \ + ({ asm volatile ( \ + "divq %4" \ + : "=a"(q), "=d"(rem) \ + : "a"(n1), "d"(n0), "g"(d0) \ + : "cc"); \ + q; \ + }) +# define REMAINDER_IS_ALREADY_CALCULATED # endif /* __ */ # endif /* __GNUC__ */ #endif /* OPENSSL_NO_ASM */ @@ -296,7 +310,9 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, rem=(n1-q*d0)&BN_MASK2; #endif -#ifdef BN_UMULT_HIGH +#if defined(BN_UMULT_LOHI) + BN_UMULT_LOHI(t2l,t2h,d1,q); +#elif defined(BN_UMULT_HIGH) t2l = d1 * q; t2h = BN_UMULT_HIGH(d1,q); #else diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h index 1db940f4c5..d10ec56e7e 100644 --- a/crypto/bn/bn_lcl.h +++ b/crypto/bn/bn_lcl.h @@ -230,6 +230,21 @@ struct bignum_ctx : "r"(a), "r"(b)); \ ret; }) # endif /* compiler */ +# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG) +# if defined(__GNUC__) +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret,discard; \ + asm ("mulq %3" \ + : "=a"(discard),"=d"(ret) \ + : "a"(a), "g"(b) \ + : "cc"); \ + ret; }) +# define BN_UMULT_LOHI(low,high,a,b) \ + asm ("mulq %3" \ + : "=a"(low),"=d"(high) \ + : "a"(a),"g"(b) \ + : "cc"); +# endif # endif /* cpu */ #endif /* OPENSSL_NO_ASM */ @@ -347,7 +362,7 @@ struct bignum_ctx #define LBITS(a) ((a)&BN_MASK2l) #define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l) -#define L2HBITS(a) ((BN_ULONG)((a)&BN_MASK2l)<>BN_BITS2)&BN_MASKl) diff --git a/crypto/des/des_locl.h b/crypto/des/des_locl.h index 9e033f7c2e..5eb7cff0ca 100644 --- a/crypto/des/des_locl.h +++ b/crypto/des/des_locl.h @@ -159,6 +159,16 @@ #if defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER) #define ROTATE(a,n) (_lrotr(a,n)) +#elif defined(__GNUC__) && __GNUC__>=2 && !defined(NO_ASM) && !defined(NO_INLINE_ASM) +# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) +# define ROTATE(a,n) ({ register unsigned int ret; \ + asm ("rorl %1,%0" \ + : "=r"(ret) \ + : "I"(n),"0"(a) \ + : "cc"); \ + ret; \ + }) +# endif #else #define ROTATE(a,n) (((a)>>(n))+((a)<<(32-(n)))) #endif diff --git a/crypto/md32_common.h b/crypto/md32_common.h index 275b93618b..573850b122 100644 --- a/crypto/md32_common.h +++ b/crypto/md32_common.h @@ -198,7 +198,7 @@ * * */ -# if defined(__i386) || defined(__i386__) +# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) # define ROTATE(a,n) ({ register unsigned int ret; \ asm ( \ "roll %1,%0" \ @@ -224,7 +224,7 @@ */ # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) /* some GNU C inline assembler templates by */ -# if (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) +# if (defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)) && !defined(I386_ONLY) # define BE_FETCH32(a) ({ register unsigned int l=(a);\ asm ( \ "bswapl %0" \ @@ -610,3 +610,28 @@ int HASH_FINAL (unsigned char *md, HASH_CTX *c) */ return 1; } + +#ifndef MD32_REG_T +#define MD32_REG_T long +/* + * This comment was originaly written for MD5, which is why it + * discusses A-D. But it basically applies to all 32-bit digests, + * which is why it was moved to common header file. + * + * In case you wonder why A-D are declared as long and not + * as MD5_LONG. Doing so results in slight performance + * boost on LP64 architectures. The catch is we don't + * really care if 32 MSBs of a 64-bit register get polluted + * with eventual overflows as we *save* only 32 LSBs in + * *either* case. Now declaring 'em long excuses the compiler + * from keeping 32 MSBs zeroed resulting in 13% performance + * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. + * Well, to be honest it should say that this *prevents* + * performance degradation. + * + * Apparently there're LP64 compilers that generate better + * code if A-D are declared int. Most notably GCC-x86_64 + * generates better code. + * + */ +#endif diff --git a/crypto/md4/md4_dgst.c b/crypto/md4/md4_dgst.c index 6446f5f5e7..7afb7185b6 100644 --- a/crypto/md4/md4_dgst.c +++ b/crypto/md4/md4_dgst.c @@ -86,21 +86,7 @@ int MD4_Init(MD4_CTX *c) void md4_block_host_order (MD4_CTX *c, const void *data, int num) { const MD4_LONG *X=data; - register unsigned long A,B,C,D; - /* - * In case you wonder why A-D are declared as long and not - * as MD4_LONG. Doing so results in slight performance - * boost on LP64 architectures. The catch is we don't - * really care if 32 MSBs of a 64-bit register get polluted - * with eventual overflows as we *save* only 32 LSBs in - * *either* case. Now declaring 'em long excuses the compiler - * from keeping 32 MSBs zeroed resulting in 13% performance - * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. - * Well, to be honest it should say that this *prevents* - * performance degradation. - * - * - */ + register unsigned MD32_REG_T A,B,C,D; A=c->A; B=c->B; @@ -176,25 +162,11 @@ void md4_block_host_order (MD4_CTX *c, const void *data, int num) void md4_block_data_order (MD4_CTX *c, const void *data_, int num) { const unsigned char *data=data_; - register unsigned long A,B,C,D,l; - /* - * In case you wonder why A-D are declared as long and not - * as MD4_LONG. Doing so results in slight performance - * boost on LP64 architectures. The catch is we don't - * really care if 32 MSBs of a 64-bit register get polluted - * with eventual overflows as we *save* only 32 LSBs in - * *either* case. Now declaring 'em long excuses the compiler - * from keeping 32 MSBs zeroed resulting in 13% performance - * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. - * Well, to be honest it should say that this *prevents* - * performance degradation. - * - * - */ + register unsigned MD32_REG_T A,B,C,D,l; #ifndef MD32_XARRAY /* See comment in crypto/sha/sha_locl.h for details. */ - unsigned long XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; + unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, + XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; # define X(i) XX##i #else MD4_LONG XX[MD4_LBLOCK]; diff --git a/crypto/md5/md5_dgst.c b/crypto/md5/md5_dgst.c index c38a3f021e..9c7abc3697 100644 --- a/crypto/md5/md5_dgst.c +++ b/crypto/md5/md5_dgst.c @@ -86,21 +86,7 @@ int MD5_Init(MD5_CTX *c) void md5_block_host_order (MD5_CTX *c, const void *data, int num) { const MD5_LONG *X=data; - register unsigned long A,B,C,D; - /* - * In case you wonder why A-D are declared as long and not - * as MD5_LONG. Doing so results in slight performance - * boost on LP64 architectures. The catch is we don't - * really care if 32 MSBs of a 64-bit register get polluted - * with eventual overflows as we *save* only 32 LSBs in - * *either* case. Now declaring 'em long excuses the compiler - * from keeping 32 MSBs zeroed resulting in 13% performance - * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. - * Well, to be honest it should say that this *prevents* - * performance degradation. - * - * - */ + register unsigned MD32_REG_T A,B,C,D; A=c->A; B=c->B; @@ -193,25 +179,11 @@ void md5_block_host_order (MD5_CTX *c, const void *data, int num) void md5_block_data_order (MD5_CTX *c, const void *data_, int num) { const unsigned char *data=data_; - register unsigned long A,B,C,D,l; - /* - * In case you wonder why A-D are declared as long and not - * as MD5_LONG. Doing so results in slight performance - * boost on LP64 architectures. The catch is we don't - * really care if 32 MSBs of a 64-bit register get polluted - * with eventual overflows as we *save* only 32 LSBs in - * *either* case. Now declaring 'em long excuses the compiler - * from keeping 32 MSBs zeroed resulting in 13% performance - * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. - * Well, to be honest it should say that this *prevents* - * performance degradation. - * - * - */ + register unsigned MD32_REG_T A,B,C,D,l; #ifndef MD32_XARRAY /* See comment in crypto/sha/sha_locl.h for details. */ - unsigned long XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; + unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, + XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; # define X(i) XX##i #else MD5_LONG XX[MD5_LBLOCK]; diff --git a/crypto/rc5/rc5_locl.h b/crypto/rc5/rc5_locl.h index d3871c6555..c700dfb1a5 100644 --- a/crypto/rc5/rc5_locl.h +++ b/crypto/rc5/rc5_locl.h @@ -149,6 +149,23 @@ #if defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER) #define ROTATE_l32(a,n) _lrotl(a,n) #define ROTATE_r32(a,n) _lrotr(a,n) +#elif defined(__GNUC__) && __GNUC__>=2 && !defined(NO_ASM) && !defined(NO_INLINE_ASM) +# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) +# define ROTATE_l32(a,n) ({ register unsigned int ret; \ + asm ("roll %%cl,%0" \ + : "=r"(ret) \ + : "c"(n),"0"(a) \ + : "cc"); \ + ret; \ + }) +# define ROTATE_r32(a,n) ({ register unsigned int ret; \ + asm ("rorl %%cl,%0" \ + : "=r"(ret) \ + : "c"(n),"0"(a) \ + : "cc"); \ + ret; \ + }) +# endif #else #define ROTATE_l32(a,n) (((a)<<(n&0x1f))|(((a)&0xffffffff)>>(32-(n&0x1f)))) #define ROTATE_r32(a,n) (((a)<<(32-(n&0x1f)))|(((a)&0xffffffff)>>(n&0x1f))) diff --git a/crypto/ripemd/rmd_dgst.c b/crypto/ripemd/rmd_dgst.c index a3170f7c8a..f351f00eea 100644 --- a/crypto/ripemd/rmd_dgst.c +++ b/crypto/ripemd/rmd_dgst.c @@ -90,8 +90,8 @@ int RIPEMD160_Init(RIPEMD160_CTX *c) void ripemd160_block_host_order (RIPEMD160_CTX *ctx, const void *p, int num) { const RIPEMD160_LONG *XX=p; - register unsigned long A,B,C,D,E; - register unsigned long a,b,c,d,e; + register unsigned MD32_REG_T A,B,C,D,E; + register unsigned MD32_REG_T a,b,c,d,e; for (;num--;XX+=HASH_LBLOCK) { @@ -290,12 +290,12 @@ void ripemd160_block_host_order (RIPEMD160_CTX *ctx, const void *p, int num) void ripemd160_block_data_order (RIPEMD160_CTX *ctx, const void *p, int num) { const unsigned char *data=p; - register unsigned long A,B,C,D,E; - unsigned long a,b,c,d,e,l; + register unsigned MD32_REG_T A,B,C,D,E; + unsigned MD32_REG_T a,b,c,d,e,l; #ifndef MD32_XARRAY /* See comment in crypto/sha/sha_locl.h for details. */ - unsigned long XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; + unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, + XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; # define X(i) XX##i #else RIPEMD160_LONG XX[16]; diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h index 471dfb9f8f..2dd63a62a6 100644 --- a/crypto/sha/sha_locl.h +++ b/crypto/sha/sha_locl.h @@ -224,10 +224,10 @@ int HASH_INIT (SHA_CTX *c) void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, int num) { const SHA_LONG *W=d; - register unsigned long A,B,C,D,E,T; + register unsigned MD32_REG_T A,B,C,D,E,T; #ifndef MD32_XARRAY - unsigned long XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; + unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, + XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; #else SHA_LONG XX[16]; #endif @@ -349,10 +349,10 @@ void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, int num) void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, int num) { const unsigned char *data=p; - register unsigned long A,B,C,D,E,T,l; + register unsigned MD32_REG_T A,B,C,D,E,T,l; #ifndef MD32_XARRAY - unsigned long XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; + unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, + XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; #else SHA_LONG XX[16]; #endif -- 2.39.2