From 69b59186f309609ee9d7b6ff3a35dd5e32d9d7dc Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Mon, 12 Sep 2016 01:07:40 +0000 Subject: Add ARMv8-A AES support More than twice as fast as plain C code. Note that both ARMv7 and ARMv8 can be further improved with better use of NEON. Also tweak ARMv7 multiplier --- src/crypto/slow-hash.c | 337 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 295 insertions(+), 42 deletions(-) (limited to 'src') diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index a0d2d1302..90fda3470 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -37,6 +37,13 @@ #include "hash-ops.h" #include "oaes_lib.h" +#define MEMORY (1 << 21) // 2MB scratchpad +#define ITER (1 << 20) +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) + #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64)) // Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI // Fall back to more portable code is down at the bottom @@ -77,12 +84,6 @@ #define ASM __asm #endif -#define MEMORY (1 << 21) // 2MB scratchpad -#define ITER (1 << 20) -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) #define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) #define U64(x) ((uint64_t *) (x)) @@ -643,9 +644,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash) extra_hashes[state.hs.b[0] & 3](&state, 200, hash); } -#elif defined(__arm__) -// ND: Some minor optimizations for ARM7 (raspberrry pi 2), effect seems to be ~40-50% faster. -// Needs more work. +#elif defined(__arm__) || defined(__aarch64__) void slow_hash_allocate_state(void) { // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c @@ -658,13 +657,6 @@ void slow_hash_free_state(void) return; } -#define MEMORY (1 << 21) /* 2 MiB */ -#define ITER (1 << 20) -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 /*16*/ -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) - #if defined(__GNUC__) #define RDATA_ALIGN16 __attribute__ ((aligned(16))) #define STATIC static @@ -677,6 +669,276 @@ void slow_hash_free_state(void) #define U64(x) ((uint64_t *) (x)) +#pragma pack(push, 1) +union cn_slow_hash_state +{ + union hash_state hs; + struct + { + uint8_t k[64]; + uint8_t init[INIT_SIZE_BYTE]; + }; +}; +#pragma pack(pop) + +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO) + +/* ARMv8-A optimized with NEON and AES instructions. + * Copied from the x86-64 AES-NI implementation. It has much the same + * characteristics as x86-64: there's no 64x64=128 multiplier for vectors, + * and moving between vector and regular registers stalls the pipeline. + */ +#include + +#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) + +#define state_index(x) (((*((uint64_t *)x) >> 4) & (TOTALBLOCKS - 1)) << 4) +#define __mul() __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(c[0]), "r"(b[0]) ); \ + __asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(c[0]), "r"(b[0]) ); + +#define pre_aes() \ + j = state_index(a); \ + _c = vld1q_u8(&hp_state[j]); \ + _a = vld1q_u8((const uint8_t *)a); \ + +#define post_aes() \ + vst1q_u8((uint8_t *)c, _c); \ + _b = veorq_u8(_b, _c); \ + vst1q_u8(&hp_state[j], _b); \ + j = state_index(c); \ + p = U64(&hp_state[j]); \ + b[0] = p[0]; b[1] = p[1]; \ + __mul(); \ + a[0] += hi; a[1] += lo; \ + p = U64(&hp_state[j]); \ + p[0] = a[0]; p[1] = a[1]; \ + a[0] ^= b[0]; a[1] ^= b[1]; \ + _b = _c; \ + + +/* Note: this was based on a standard 256bit key schedule but + * it's been shortened since Cryptonight doesn't use the full + * key schedule. Don't try to use this for vanilla AES. +*/ +static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) { +__asm__("mov x2, %1\n\t" : : "r"(key), "r"(expandedKey)); +__asm__( +" adr x3,Lrcon\n" +"\n" +" eor v0.16b,v0.16b,v0.16b\n" +" ld1 {v3.16b},[x0],#16\n" +" ld1 {v1.4s,v2.4s},[x3],#32\n" +" b L256\n" +".align 5\n" +"Lrcon:\n" +".long 0x01,0x01,0x01,0x01\n" +".long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat\n" +".long 0x1b,0x1b,0x1b,0x1b\n" +"\n" +".align 4\n" +"L256:\n" +" ld1 {v4.16b},[x0]\n" +" mov w1,#5\n" +" st1 {v3.4s},[x2],#16\n" +"\n" +"Loop256:\n" +" tbl v6.16b,{v4.16b},v2.16b\n" +" ext v5.16b,v0.16b,v3.16b,#12\n" +" st1 {v4.4s},[x2],#16\n" +" aese v6.16b,v0.16b\n" +" subs w1,w1,#1\n" +"\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v6.16b,v6.16b,v1.16b\n" +" eor v3.16b,v3.16b,v5.16b\n" +" shl v1.16b,v1.16b,#1\n" +" eor v3.16b,v3.16b,v6.16b\n" +" st1 {v3.4s},[x2],#16\n" +" b.eq Ldone\n" +"\n" +" dup v6.4s,v3.s[3] // just splat\n" +" ext v5.16b,v0.16b,v4.16b,#12\n" +" aese v6.16b,v0.16b\n" +"\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +"\n" +" eor v4.16b,v4.16b,v6.16b\n" +" b Loop256\n" +"\n" +"Ldone:\n"); +} + +/* An ordinary AES round is a sequence of SubBytes, ShiftRows, MixColumns, AddRoundKey. There + * is also an InitialRound which consists solely of AddRoundKey. The ARM instructions slice + * this sequence differently; the aese instruction performs AddRoundKey, SubBytes, ShiftRows. + * The aesmc instruction does the MixColumns. Since the aese instruction moves the AddRoundKey + * up front, and Cryptonight's hash skips the InitialRound step, we have to kludge it here by + * feeding in a vector of zeros for our first step. Also we have to do our own Xor explicitly + * at the last step, to provide the AddRoundKey that the ARM instructions omit. + */ +STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks) +{ + const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; + uint8x16_t tmp; + int i; + + for (i=0; i