aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSChernykh <sergey.v.chernykh@gmail.com>2022-04-16 11:48:37 +0200
committerSChernykh <sergey.v.chernykh@gmail.com>2022-04-18 10:01:13 +0200
commit268a0393e9e7e02c5f2c41dc0137540cbf61ab76 (patch)
treea0b6007e71844f711ae2d852204db7be683f2e2a
parentMerge pull request #8242 (diff)
downloadmonero-268a0393e9e7e02c5f2c41dc0137540cbf61ab76.tar.xz
Optimized keccak implementation
All tests were conducted on the same PC (Ryzen 5 5600X running at fixed 4.65 GHz). Before: test_cn_fast_hash<32> (100000 calls) - OK: 1 us/call test_cn_fast_hash<16384> (1000 calls) - OK: 164 us/call After: test_cn_fast_hash<32> (100000 calls) - OK: 0 us/call test_cn_fast_hash<16384> (1000 calls) - OK: 31 us/call More than 5 times speedup for cn_fast_hash. Also noticed consistent 1-2% improvement in test_construct_tx results.
-rw-r--r--src/crypto/keccak.c95
1 files changed, 62 insertions, 33 deletions
diff --git a/src/crypto/keccak.c b/src/crypto/keccak.c
index 72d472d8a..f098cbdf0 100644
--- a/src/crypto/keccak.c
+++ b/src/crypto/keccak.c
@@ -31,54 +31,83 @@ const uint64_t keccakf_rndc[24] =
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
-const int keccakf_rotc[24] =
-{
- 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
- 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
-};
-
-const int keccakf_piln[24] =
-{
- 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
- 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
-};
-
// update the state with given number of rounds
void keccakf(uint64_t st[25], int rounds)
{
- int i, j, round;
+ int round;
uint64_t t, bc[5];
- for (round = 0; round < rounds; round++) {
-
+ for (round = 0; round < rounds; ++round) {
// Theta
- for (i = 0; i < 5; i++)
- bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
-
- for (i = 0; i < 5; i++) {
- t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
- for (j = 0; j < 25; j += 5)
- st[j + i] ^= t;
+ bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+ bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+ bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+ bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+ bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+#define THETA(i) { \
+ t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); \
+ st[i ] ^= t; \
+ st[i + 5] ^= t; \
+ st[i + 10] ^= t; \
+ st[i + 15] ^= t; \
+ st[i + 20] ^= t; \
}
+ THETA(0);
+ THETA(1);
+ THETA(2);
+ THETA(3);
+ THETA(4);
+
// Rho Pi
t = st[1];
- for (i = 0; i < 24; i++) {
- j = keccakf_piln[i];
- bc[0] = st[j];
- st[j] = ROTL64(t, keccakf_rotc[i]);
- t = bc[0];
- }
+ st[ 1] = ROTL64(st[ 6], 44);
+ st[ 6] = ROTL64(st[ 9], 20);
+ st[ 9] = ROTL64(st[22], 61);
+ st[22] = ROTL64(st[14], 39);
+ st[14] = ROTL64(st[20], 18);
+ st[20] = ROTL64(st[ 2], 62);
+ st[ 2] = ROTL64(st[12], 43);
+ st[12] = ROTL64(st[13], 25);
+ st[13] = ROTL64(st[19], 8);
+ st[19] = ROTL64(st[23], 56);
+ st[23] = ROTL64(st[15], 41);
+ st[15] = ROTL64(st[ 4], 27);
+ st[ 4] = ROTL64(st[24], 14);
+ st[24] = ROTL64(st[21], 2);
+ st[21] = ROTL64(st[ 8], 55);
+ st[ 8] = ROTL64(st[16], 45);
+ st[16] = ROTL64(st[ 5], 36);
+ st[ 5] = ROTL64(st[ 3], 28);
+ st[ 3] = ROTL64(st[18], 21);
+ st[18] = ROTL64(st[17], 15);
+ st[17] = ROTL64(st[11], 10);
+ st[11] = ROTL64(st[ 7], 6);
+ st[ 7] = ROTL64(st[10], 3);
+ st[10] = ROTL64(t, 1);
// Chi
- for (j = 0; j < 25; j += 5) {
- for (i = 0; i < 5; i++)
- bc[i] = st[j + i];
- for (i = 0; i < 5; i++)
- st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
+#define CHI(j) { \
+ const uint64_t st0 = st[j ]; \
+ const uint64_t st1 = st[j + 1]; \
+ const uint64_t st2 = st[j + 2]; \
+ const uint64_t st3 = st[j + 3]; \
+ const uint64_t st4 = st[j + 4]; \
+ st[j ] ^= ~st1 & st2; \
+ st[j + 1] ^= ~st2 & st3; \
+ st[j + 2] ^= ~st3 & st4; \
+ st[j + 3] ^= ~st4 & st0; \
+ st[j + 4] ^= ~st0 & st1; \
}
+ CHI( 0);
+ CHI( 5);
+ CHI(10);
+ CHI(15);
+ CHI(20);
+
// Iota
st[0] ^= keccakf_rndc[round];
}