diff options
author | Howard Chu <hyc@symas.com> | 2016-09-12 01:07:40 +0000 |
---|---|---|
committer | Howard Chu <hyc@symas.com> | 2016-09-16 01:45:49 +0100 |
commit | 69b59186f309609ee9d7b6ff3a35dd5e32d9d7dc (patch) | |
tree | 1a1158ebe4e94f7396ef10c70dc150ecb3e33807 /src | |
parent | Merge pull request #1074 (diff) | |
download | monero-69b59186f309609ee9d7b6ff3a35dd5e32d9d7dc.tar.xz |
Add ARMv8-A AES support
More than twice as fast as plain C code. Note that both ARMv7 and
ARMv8 can be further improved with better use of NEON.
Also tweak ARMv7 multiplier
Diffstat (limited to '')
-rw-r--r-- | src/crypto/slow-hash.c | 337 |
1 files changed, 295 insertions, 42 deletions
diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index a0d2d1302..90fda3470 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -37,6 +37,13 @@ #include "hash-ops.h" #include "oaes_lib.h" +#define MEMORY (1 << 21) // 2MB scratchpad +#define ITER (1 << 20) +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) + #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64)) // Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI // Fall back to more portable code is down at the bottom @@ -77,12 +84,6 @@ #define ASM __asm #endif -#define MEMORY (1 << 21) // 2MB scratchpad -#define ITER (1 << 20) -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) #define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) #define U64(x) ((uint64_t *) (x)) @@ -643,9 +644,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash) extra_hashes[state.hs.b[0] & 3](&state, 200, hash); } -#elif defined(__arm__) -// ND: Some minor optimizations for ARM7 (raspberrry pi 2), effect seems to be ~40-50% faster. -// Needs more work. +#elif defined(__arm__) || defined(__aarch64__) void slow_hash_allocate_state(void) { // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c @@ -658,13 +657,6 @@ void slow_hash_free_state(void) return; } -#define MEMORY (1 << 21) /* 2 MiB */ -#define ITER (1 << 20) -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 /*16*/ -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) - #if defined(__GNUC__) #define RDATA_ALIGN16 __attribute__ ((aligned(16))) #define STATIC static @@ -677,6 +669,276 @@ void slow_hash_free_state(void) #define U64(x) ((uint64_t *) (x)) +#pragma pack(push, 1) +union cn_slow_hash_state +{ + union hash_state hs; + struct + { + uint8_t k[64]; + uint8_t init[INIT_SIZE_BYTE]; + }; +}; +#pragma pack(pop) + +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO) + +/* ARMv8-A optimized with NEON and AES instructions. + * Copied from the x86-64 AES-NI implementation. It has much the same + * characteristics as x86-64: there's no 64x64=128 multiplier for vectors, + * and moving between vector and regular registers stalls the pipeline. + */ +#include <arm_neon.h> + +#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) + +#define state_index(x) (((*((uint64_t *)x) >> 4) & (TOTALBLOCKS - 1)) << 4) +#define __mul() __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(c[0]), "r"(b[0]) ); \ + __asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(c[0]), "r"(b[0]) ); + +#define pre_aes() \ + j = state_index(a); \ + _c = vld1q_u8(&hp_state[j]); \ + _a = vld1q_u8((const uint8_t *)a); \ + +#define post_aes() \ + vst1q_u8((uint8_t *)c, _c); \ + _b = veorq_u8(_b, _c); \ + vst1q_u8(&hp_state[j], _b); \ + j = state_index(c); \ + p = U64(&hp_state[j]); \ + b[0] = p[0]; b[1] = p[1]; \ + __mul(); \ + a[0] += hi; a[1] += lo; \ + p = U64(&hp_state[j]); \ + p[0] = a[0]; p[1] = a[1]; \ + a[0] ^= b[0]; a[1] ^= b[1]; \ + _b = _c; \ + + +/* Note: this was based on a standard 256bit key schedule but + * it's been shortened since Cryptonight doesn't use the full + * key schedule. Don't try to use this for vanilla AES. +*/ +static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) { +__asm__("mov x2, %1\n\t" : : "r"(key), "r"(expandedKey)); +__asm__( +" adr x3,Lrcon\n" +"\n" +" eor v0.16b,v0.16b,v0.16b\n" +" ld1 {v3.16b},[x0],#16\n" +" ld1 {v1.4s,v2.4s},[x3],#32\n" +" b L256\n" +".align 5\n" +"Lrcon:\n" +".long 0x01,0x01,0x01,0x01\n" +".long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat\n" +".long 0x1b,0x1b,0x1b,0x1b\n" +"\n" +".align 4\n" +"L256:\n" +" ld1 {v4.16b},[x0]\n" +" mov w1,#5\n" +" st1 {v3.4s},[x2],#16\n" +"\n" +"Loop256:\n" +" tbl v6.16b,{v4.16b},v2.16b\n" +" ext v5.16b,v0.16b,v3.16b,#12\n" +" st1 {v4.4s},[x2],#16\n" +" aese v6.16b,v0.16b\n" +" subs w1,w1,#1\n" +"\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v6.16b,v6.16b,v1.16b\n" +" eor v3.16b,v3.16b,v5.16b\n" +" shl v1.16b,v1.16b,#1\n" +" eor v3.16b,v3.16b,v6.16b\n" +" st1 {v3.4s},[x2],#16\n" +" b.eq Ldone\n" +"\n" +" dup v6.4s,v3.s[3] // just splat\n" +" ext v5.16b,v0.16b,v4.16b,#12\n" +" aese v6.16b,v0.16b\n" +"\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +"\n" +" eor v4.16b,v4.16b,v6.16b\n" +" b Loop256\n" +"\n" +"Ldone:\n"); +} + +/* An ordinary AES round is a sequence of SubBytes, ShiftRows, MixColumns, AddRoundKey. There + * is also an InitialRound which consists solely of AddRoundKey. The ARM instructions slice + * this sequence differently; the aese instruction performs AddRoundKey, SubBytes, ShiftRows. + * The aesmc instruction does the MixColumns. Since the aese instruction moves the AddRoundKey + * up front, and Cryptonight's hash skips the InitialRound step, we have to kludge it here by + * feeding in a vector of zeros for our first step. Also we have to do our own Xor explicitly + * at the last step, to provide the AddRoundKey that the ARM instructions omit. + */ +STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks) +{ + const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; + uint8x16_t tmp; + int i; + + for (i=0; i<nblocks; i++) + { + uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE); + tmp = vaeseq_u8(tmp, zero); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[0]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[1]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[2]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[3]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[4]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[5]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[6]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[7]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[8]); + tmp = vaesmcq_u8(tmp); + tmp = veorq_u8(tmp, k[9]); + vst1q_u8(out + i * AES_BLOCK_SIZE, tmp); + } +} + +STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, const uint8_t *xor, int nblocks) +{ + const uint8x16_t *k = (const uint8x16_t *)expandedKey; + const uint8x16_t *x = (const uint8x16_t *)xor; + uint8x16_t tmp; + int i; + + for (i=0; i<nblocks; i++) + { + uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE); + tmp = vaeseq_u8(tmp, x[i]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[0]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[1]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[2]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[3]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[4]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[5]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[6]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[7]); + tmp = vaesmcq_u8(tmp); + tmp = vaeseq_u8(tmp, k[8]); + tmp = vaesmcq_u8(tmp); + tmp = veorq_u8(tmp, k[9]); + vst1q_u8(out + i * AES_BLOCK_SIZE, tmp); + } +} + +void cn_slow_hash(const void *data, size_t length, char *hash) +{ + RDATA_ALIGN16 uint8_t expandedKey[240]; + RDATA_ALIGN16 uint8_t hp_state[MEMORY]; + + uint8_t text[INIT_SIZE_BYTE]; + RDATA_ALIGN16 uint64_t a[2]; + RDATA_ALIGN16 uint64_t b[2]; + RDATA_ALIGN16 uint64_t c[2]; + union cn_slow_hash_state state; + uint8x16_t _a, _b, _c, zero = {0}; + uint64_t hi, lo; + + size_t i, j; + uint64_t *p = NULL; + + static void (*const extra_hashes[4])(const void *, size_t, char *) = + { + hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein + }; + + /* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */ + + hash_process(&state.hs, data, length); + memcpy(text, state.init, INIT_SIZE_BYTE); + + /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill + * the 2MB large random access buffer. + */ + + aes_expand_key(state.hs.b, expandedKey); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK); + memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + } + + U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0]; + U64(a)[1] = U64(&state.k[0])[1] ^ U64(&state.k[32])[1]; + U64(b)[0] = U64(&state.k[16])[0] ^ U64(&state.k[48])[0]; + U64(b)[1] = U64(&state.k[16])[1] ^ U64(&state.k[48])[1]; + + /* CryptoNight Step 3: Bounce randomly 1 million times through the mixing buffer, + * using 500,000 iterations of the following mixing function. Each execution + * performs two reads and writes from the mixing buffer. + */ + + _b = vld1q_u8((const uint8_t *)b); + + + for(i = 0; i < ITER / 2; i++) + { + pre_aes(); + _c = vaeseq_u8(_c, zero); + _c = vaesmcq_u8(_c); + _c = veorq_u8(_c, _a); + post_aes(); + } + + /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds + * of AES encryption to mix the random data back into the 'text' buffer. 'text' + * was originally created with the output of Keccak1600. */ + + memcpy(text, state.init, INIT_SIZE_BYTE); + + aes_expand_key(&state.hs.b[32], expandedKey); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + // add the xor to the pseudo round + aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + } + + /* CryptoNight Step 5: Apply Keccak to the state again, and then + * use the resulting data to select which of four finalizer + * hash functions to apply to the data (Blake, Groestl, JH, or Skein). + * Use this hash to squeeze the state array down + * to the final 256 bit hash output. + */ + + memcpy(state.init, text, INIT_SIZE_BYTE); + hash_permutation(&state.hs); + extra_hashes[state.hs.b[0] & 3](&state, 200, hash); +} +#else /* aarch64 && crypto */ + +// ND: Some minor optimizations for ARMv7 (raspberrry pi 2), effect seems to be ~40-50% faster. +// Needs more work. #include "aesb.c" #ifdef NO_OPTIMIZED_MULTIPLY_ON_ARM @@ -714,13 +976,21 @@ void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cres) { } #else // !NO_OPTIMIZED_MULTIPLY_ON_ARM +#ifdef __aarch64__ /* ARM64, no crypto */ +#define mul(a, b, c) cn_mul128((const uint64_t *)a, (const uint64_t *)b, (uint64_t *)c) +STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r) +{ + uint64_t lo, hi; + __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(a[0]), "r"(b[0]) ); + __asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(a[0]), "r"(b[0]) ); + r[0] = hi; + r[1] = lo; +} +#else /* ARM32 */ /* Can work as inline, but actually runs slower. Keep it separate */ -#define mul(a, b, c) cn_mul128(a, b, c) -void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cr) +#define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c) +void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r) { - const uint32_t *aa = (uint32_t *)ca; - const uint32_t *bb = (uint32_t *)cb; - uint32_t *r = (uint32_t *)cr; uint32_t t0, t1; __asm__ __volatile__( "umull %[t0], %[t1], %[a], %[b]\n\t" @@ -743,10 +1013,11 @@ __asm__ __volatile__( "str %[t0], [%[r]]\n\t" "str %[t1], [%[r], #4]\n\t" - : [t0]"=&r"(t0), [t1]"=&r"(t1) + : [t0]"=&r"(t0), [t1]"=&r"(t1), "=m"(r[0]), "=m"(r[1]), "=m"(r[2]), "=m"(r[3]) : [A]"r"(aa[1]), [a]"r"(aa[0]), [B]"r"(bb[1]), [b]"r"(bb[0]), [r]"r"(r) - : "cc", "memory"); + : "cc"); } +#endif /* !aarch64 */ #endif // NO_OPTIMIZED_MULTIPLY_ON_ARM STATIC INLINE void sum_half_blocks(uint8_t* a, const uint8_t* b) @@ -779,18 +1050,6 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b) U64(a)[1] ^= U64(b)[1]; } -#pragma pack(push, 1) -union cn_slow_hash_state -{ - union hash_state hs; - struct - { - uint8_t k[64]; - uint8_t init[INIT_SIZE_BYTE]; - }; -}; -#pragma pack(pop) - void cn_slow_hash(const void *data, size_t length, char *hash) { uint8_t long_state[MEMORY]; @@ -871,6 +1130,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash) hash_permutation(&state.hs); extra_hashes[state.hs.b[0] & 3](&state, 200, hash); } +#endif /* !aarch64 || !crypto */ #else // Portable implementation as a fallback @@ -891,13 +1151,6 @@ static void (*const extra_hashes[4])(const void *, size_t, char *) = { hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein }; -#define MEMORY (1 << 21) /* 2 MiB */ -#define ITER (1 << 20) -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 /*16*/ -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) - extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); |