From 268a0393e9e7e02c5f2c41dc0137540cbf61ab76 Mon Sep 17 00:00:00 2001
From: SChernykh
Date: Sat, 16 Apr 2022 11:48:37 +0200
Subject: Optimized keccak implementation

All tests were conducted on the same PC (Ryzen 5 5600X running at fixed 4.65 GHz).

Before:
test_cn_fast_hash<32> (100000 calls) - OK: 1 us/call
test_cn_fast_hash<16384> (1000 calls) - OK: 164 us/call

After:
test_cn_fast_hash<32> (100000 calls) - OK: 0 us/call
test_cn_fast_hash<16384> (1000 calls) - OK: 31 us/call

More than 5 times speedup for cn_fast_hash. Also noticed consistent 1-2% improvement in test_construct_tx results.
---
 src/crypto/keccak.c | 95 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 33 deletions(-)

diff --git a/src/crypto/keccak.c b/src/crypto/keccak.c
index 72d472d8a..f098cbdf0 100644
--- a/src/crypto/keccak.c
+++ b/src/crypto/keccak.c
@@ -31,54 +31,83 @@ const uint64_t keccakf_rndc[24] =
     0x8000000000008080, 0x0000000080000001, 0x8000000080008008
 };
 
-const int keccakf_rotc[24] =
-{
-    1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
-    27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
-};
-
-const int keccakf_piln[24] =
-{
-    10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
-    15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
-};
-
 // update the state with given number of rounds
 
 void keccakf(uint64_t st[25], int rounds)
 {
-    int i, j, round;
+    int round;
     uint64_t t, bc[5];
 
-    for (round = 0; round < rounds; round++) {
-
+    for (round = 0; round < rounds; ++round) {
         // Theta
-        for (i = 0; i < 5; i++)
-            bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
-
-        for (i = 0; i < 5; i++) {
-            t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
-            for (j = 0; j < 25; j += 5)
-                st[j + i] ^= t;
+        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+#define THETA(i) { \
+            t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); \
+            st[i ] ^= t; \
+            st[i + 5] ^= t; \
+            st[i + 10] ^= t; \
+            st[i + 15] ^= t; \
+            st[i + 20] ^= t; \
         }
 
+        THETA(0);
+        THETA(1);
+        THETA(2);
+        THETA(3);
+        THETA(4);
+
         // Rho Pi
         t = st[1];
-        for (i = 0; i < 24; i++) {
-            j = keccakf_piln[i];
-            bc[0] = st[j];
-            st[j] = ROTL64(t, keccakf_rotc[i]);
-            t = bc[0];
-        }
+        st[ 1] = ROTL64(st[ 6], 44);
+        st[ 6] = ROTL64(st[ 9], 20);
+        st[ 9] = ROTL64(st[22], 61);
+        st[22] = ROTL64(st[14], 39);
+        st[14] = ROTL64(st[20], 18);
+        st[20] = ROTL64(st[ 2], 62);
+        st[ 2] = ROTL64(st[12], 43);
+        st[12] = ROTL64(st[13], 25);
+        st[13] = ROTL64(st[19], 8);
+        st[19] = ROTL64(st[23], 56);
+        st[23] = ROTL64(st[15], 41);
+        st[15] = ROTL64(st[ 4], 27);
+        st[ 4] = ROTL64(st[24], 14);
+        st[24] = ROTL64(st[21], 2);
+        st[21] = ROTL64(st[ 8], 55);
+        st[ 8] = ROTL64(st[16], 45);
+        st[16] = ROTL64(st[ 5], 36);
+        st[ 5] = ROTL64(st[ 3], 28);
+        st[ 3] = ROTL64(st[18], 21);
+        st[18] = ROTL64(st[17], 15);
+        st[17] = ROTL64(st[11], 10);
+        st[11] = ROTL64(st[ 7], 6);
+        st[ 7] = ROTL64(st[10], 3);
+        st[10] = ROTL64(t, 1);
 
         // Chi
-        for (j = 0; j < 25; j += 5) {
-            for (i = 0; i < 5; i++)
-                bc[i] = st[j + i];
-            for (i = 0; i < 5; i++)
-                st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
+#define CHI(j) { \
+            const uint64_t st0 = st[j ]; \
+            const uint64_t st1 = st[j + 1]; \
+            const uint64_t st2 = st[j + 2]; \
+            const uint64_t st3 = st[j + 3]; \
+            const uint64_t st4 = st[j + 4]; \
+            st[j ] ^= ~st1 & st2; \
+            st[j + 1] ^= ~st2 & st3; \
+            st[j + 2] ^= ~st3 & st4; \
+            st[j + 3] ^= ~st4 & st0; \
+            st[j + 4] ^= ~st0 & st1; \
         }
 
+        CHI( 0);
+        CHI( 5);
+        CHI(10);
+        CHI(15);
+        CHI(20);
+
         // Iota
         st[0] ^= keccakf_rndc[round];
     }
-- 
cgit v1.2.3