From: Niels Möller Date: Sat, 24 Jul 2010 21:47:57 +0000 (+0200) Subject: * camellia-set-encrypt-key.c (camellia_set_encrypt_key): Reduced X-Git-Tag: nettle_2.1_release_20100725~12 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5535723634e1554c8f9c2517692cc0f491722095;p=thirdparty%2Fnettle.git * camellia-set-encrypt-key.c (camellia_set_encrypt_key): Reduced code size, no complete loop unroll. Use one loop for each phase of the post-processing. Rev: nettle/camellia-set-encrypt-key.c:1.2 --- diff --git a/camellia-set-encrypt-key.c b/camellia-set-encrypt-key.c index dc635642..b6ea7b95 100644 --- a/camellia-set-encrypt-key.c +++ b/camellia-set-encrypt-key.c @@ -256,7 +256,7 @@ camellia_set_encrypt_key(struct camellia_ctx *ctx, /* Subkeys according to the spec, 26 for short keys and 34 for large keys */ uint64_t subkey[34]; - uint64_t kw4; + uint64_t kw2, kw4; uint32_t dw, tl, tr; unsigned i; @@ -285,146 +285,76 @@ camellia_set_encrypt_key(struct camellia_ctx *ctx, } /* absorb kw2 to other subkeys */ - subkey[3] ^= subkey[1]; - subkey[5] ^= subkey[1]; - subkey[7] ^= subkey[1]; - /* FIXME: gcc for x86_32 is smart enough to fetch the 32 low bits - and xor the result into the 32 high bits, but it still generates - worse code than for explicit 32-bit operations. */ - subkey[1] ^= (subkey[1] & ~subkey[9]) << 32; - dw = (subkey[1] & subkey[9]) >> 32; subkey[1] ^= ROL32(1, dw); - - subkey[11] ^= subkey[1]; - subkey[13] ^= subkey[1]; - subkey[15] ^= subkey[1]; - subkey[1] ^= (subkey[1] & ~subkey[17]) << 32; - dw = (subkey[1] & subkey[17]) >> 32; subkey[1] ^= ROL32(1, dw); - - subkey[19] ^= subkey[1]; - subkey[21] ^= subkey[1]; - subkey[23] ^= subkey[1]; - if (ctx->nkeys < 32) - { - subkey[24] ^= subkey[1]; - } - else - { - subkey[1] ^= (subkey[1] & ~subkey[25]) << 32; - dw = (subkey[1] & subkey[25]) >> 32; subkey[1] ^= ROL32(1, dw); + kw2 = subkey[1]; - subkey[27] ^= subkey[1]; - subkey[29] ^= subkey[1]; - subkey[31] ^= subkey[1]; - subkey[32] ^= subkey[1]; + subkey[3] ^= kw2; + subkey[5] ^= kw2; + subkey[7] ^= kw2; + for (i = 8; i < ctx->nkeys - 2; i += 8) + { + /* FIXME: gcc for x86_32 is smart enough to fetch the 32 low bits + and xor the result into the 32 high bits, but it still generates + worse code than for explicit 32-bit operations. */ + kw2 ^= (kw2 & ~subkey[i+1]) << 32; + dw = (kw2 & subkey[i+1]) >> 32; kw2 ^= ROL32(1, dw); + + subkey[i+3] ^= kw2; + subkey[i+5] ^= kw2; + subkey[i+7] ^= kw2; } - + subkey[i] ^= kw2; + /* absorb kw4 to other subkeys */ kw4 = subkey[ctx->nkeys - 1]; - - if (ctx->nkeys >= 32) + + for (i = ctx->nkeys - 10; i > 0; i -= 8) { - subkey[30] ^= kw4; - subkey[28] ^= kw4; - subkey[26] ^= kw4; - kw4 ^= (kw4 & ~subkey[24]) << 32; - dw = (kw4 & subkey[24]) >> 32; kw4 ^= ROL32(1, dw); + subkey[i+6] ^= kw4; + subkey[i+4] ^= kw4; + subkey[i+2] ^= kw4; + kw4 ^= (kw4 & ~subkey[i]) << 32; + dw = (kw4 & subkey[i]) >> 32; kw4 ^= ROL32(1, dw); } - subkey[22] ^= kw4; - subkey[20] ^= kw4; - subkey[18] ^= kw4; - kw4 ^= (kw4 & ~subkey[16]) << 32; - dw = (kw4 & subkey[16]) >> 32; kw4 ^= ROL32(1, dw); - - subkey[14] ^= kw4; - subkey[12] ^= kw4; - subkey[10] ^= kw4; - kw4 ^= (kw4 & ~subkey[8]) << 32; - dw = (kw4 & subkey[8]) >> 32; kw4 ^= ROL32(1, dw); - subkey[6] ^= kw4; subkey[4] ^= kw4; subkey[2] ^= kw4; subkey[0] ^= kw4; /* key XOR is end of F-function */ - ctx->keys[0] = subkey[0] ^subkey[2]; - + ctx->keys[0] = subkey[0] ^ subkey[2]; ctx->keys[2] = subkey[3]; + ctx->keys[3] = subkey[2] ^ subkey[4]; ctx->keys[4] = subkey[3] ^ subkey[5]; ctx->keys[5] = subkey[4] ^ subkey[6]; ctx->keys[6] = subkey[5] ^ subkey[7]; - tl = (subkey[10] >> 32) ^ (subkey[10] & ~subkey[8]); - dw = tl & (subkey[8] >> 32); - tr = subkey[10] ^ROL32(1, dw); - ctx->keys[7] = subkey[6] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[8] = subkey[8]; - ctx->keys[9] = subkey[9]; - - tl = (subkey[7] >> 32) ^ (subkey[7] & ~subkey[9]); - dw = tl & (subkey[9] >> 32); - tr = subkey[7] ^ ROL32(1, dw); - ctx->keys[10] = subkey[11] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[11] = subkey[10] ^ subkey[12]; - ctx->keys[12] = subkey[11] ^ subkey[13]; - ctx->keys[13] = subkey[12] ^ subkey[14]; - ctx->keys[14] = subkey[13] ^ subkey[15]; - - tl = (subkey[18] >> 32) ^ (subkey[18] & ~subkey[16]); - dw = tl & (subkey[16] >> 32); - tr = subkey[18] ^ ROL32(1, dw); - ctx->keys[15] = subkey[14] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[16] = subkey[16]; - ctx->keys[17] = subkey[17]; - - tl = (subkey[15] >> 32) ^ (subkey[15] & ~subkey[17]); - dw = tl & (subkey[17] >> 32); - tr = subkey[15] ^ ROL32(1, dw); - ctx->keys[18] = subkey[19] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[19] = subkey[18] ^ subkey[20]; - ctx->keys[20] = subkey[19] ^ subkey[21]; - ctx->keys[21] = subkey[20] ^ subkey[22]; - ctx->keys[22] = subkey[21] ^ subkey[23]; - - if (ctx->nkeys < 32) - { - ctx->keys[23] = subkey[22]; - ctx->keys[24] = subkey[24] ^ subkey[23]; - - } - else + for (i = 8; i < ctx->nkeys - 2; i += 8) { - tl = (subkey[26] >> 32) ^ (subkey[26] & ~subkey[24]); - dw = tl & (subkey[24] >> 32); - tr = subkey[26] ^ ROL32(1, dw); - ctx->keys[23] = subkey[22] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[24] = subkey[24]; - ctx->keys[25] = subkey[25]; - - tl = (subkey[23] >> 32) ^ (subkey[23] & ~subkey[25]); - dw = tl & (subkey[25] >> 32); - tr = subkey[23] ^ ROL32(1, dw); - ctx->keys[26] = subkey[27] ^ ( ((uint64_t) tl << 32) | tr); - - ctx->keys[27] = subkey[26] ^ subkey[28]; - ctx->keys[28] = subkey[27] ^ subkey[29]; - ctx->keys[29] = subkey[28] ^ subkey[30]; - ctx->keys[30] = subkey[29] ^ subkey[31]; - - ctx->keys[31] = subkey[30]; - ctx->keys[32] = subkey[32] ^ subkey[31]; - + tl = (subkey[i+2] >> 32) ^ (subkey[i+2] & ~subkey[i]); + dw = tl & (subkey[i] >> 32); + tr = subkey[i+2] ^ ROL32(1, dw); + ctx->keys[i-1] = subkey[i-2] ^ ( ((uint64_t) tl << 32) | tr); + + ctx->keys[i] = subkey[i]; + ctx->keys[i+1] = subkey[i+1]; + + tl = (subkey[i-1] >> 32) ^ (subkey[i-1] & ~subkey[i+1]); + dw = tl & (subkey[i+1] >> 32); + tr = subkey[i-1] ^ ROL32(1, dw); + ctx->keys[i+2] = subkey[i+3] ^ ( ((uint64_t) tl << 32) | tr); + + ctx->keys[i+3] = subkey[i+2] ^ subkey[i+4]; + ctx->keys[i+4] = subkey[i+3] ^ subkey[i+5]; + ctx->keys[i+5] = subkey[i+4] ^ subkey[i+6]; + ctx->keys[i+6] = subkey[i+5] ^ subkey[i+7]; } + ctx->keys[i-1] = subkey[i-2]; + ctx->keys[i] = subkey[i] ^ subkey[i-1]; + for (i = 0; i < ctx->nkeys - 2; i += 8) { - /* apply the inverse of the last half of F-function */ CAMELLIA_F_HALF_INV(ctx->keys[i+2]); CAMELLIA_F_HALF_INV(ctx->keys[i+3]);