diff options
Diffstat (limited to 'src/crypto')
-rw-r--r-- | src/crypto/slow-hash.c | 210 |
1 files changed, 159 insertions, 51 deletions
diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index 5a773f3cf..38aeeee54 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -51,6 +51,12 @@ #define INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) +#if defined(_MSC_VER) +#define THREADV __declspec(thread) +#else +#define THREADV __thread +#endif + extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); @@ -89,6 +95,28 @@ static inline int use_v4_jit(void) #endif } +#if defined(__x86_64__) || defined(__aarch64__) +static inline int force_software_aes(void) +{ + static int use = -1; + + if (use != -1) + return use; + + const char *env = getenv("MONERO_USE_SOFTWARE_AES"); + if (!env) { + use = 0; + } + else if (!strcmp(env, "0") || !strcmp(env, "no")) { + use = 0; + } + else { + use = 1; + } + return use; +} +#endif + #define VARIANT1_1(p) \ do if (variant == 1) \ { \ @@ -437,12 +465,6 @@ static inline int use_v4_jit(void) _b1 = _b; \ _b = _c; \ -#if defined(_MSC_VER) -#define THREADV __declspec(thread) -#else -#define THREADV __thread -#endif - #pragma pack(push, 1) union cn_slow_hash_state { @@ -498,25 +520,6 @@ STATIC INLINE void xor64(uint64_t *a, const uint64_t b) * @return true if the CPU supports AES, false otherwise */ -STATIC INLINE int force_software_aes(void) -{ - static int use = -1; - - if (use != -1) - return use; - - const char *env = getenv("MONERO_USE_SOFTWARE_AES"); - if (!env) { - use = 0; - } - else if (!strcmp(env, "0") || !strcmp(env, "no")) { - use = 0; - } - else { - use = 1; - } - return use; -} STATIC INLINE int check_aes_hw(void) { @@ -1009,6 +1012,44 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int } #elif !defined NO_AES && (defined(__arm__) || defined(__aarch64__)) +#ifdef __aarch64__ +#include <sys/mman.h> +THREADV uint8_t *hp_state = NULL; +THREADV int hp_malloced = 0; + +void cn_slow_hash_allocate_state(void) +{ + if(hp_state != NULL) + return; + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0 +#endif + hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + + if(hp_state == MAP_FAILED) + hp_state = NULL; + if(hp_state == NULL) + { + hp_malloced = 1; + hp_state = (uint8_t *) malloc(MEMORY); + } +} + +void cn_slow_hash_free_state(void) +{ + if(hp_state == NULL) + return; + + if (hp_malloced) + free(hp_state); + else + munmap(hp_state, MEMORY); + hp_state = NULL; + hp_malloced = 0; +} +#else void cn_slow_hash_allocate_state(void) { // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c @@ -1020,6 +1061,7 @@ void cn_slow_hash_free_state(void) // As above return; } +#endif #if defined(__GNUC__) #define RDATA_ALIGN16 __attribute__ ((aligned(16))) @@ -1060,6 +1102,23 @@ union cn_slow_hash_state * and moving between vector and regular registers stalls the pipeline. */ #include <arm_neon.h> +#ifndef __APPLE__ +#include <sys/auxv.h> +#include <asm/hwcap.h> +#endif + +STATIC INLINE int check_aes_hw(void) +{ +#ifdef __APPLE__ + return 1; +#else + static int supported = -1; + + if(supported < 0) + supported = (getauxval(AT_HWCAP) & HWCAP_AES) != 0; + return supported; +#endif +} #define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE) @@ -1156,7 +1215,6 @@ __asm__( STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks) { const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; - uint8x16_t tmp; int i; for (i=0; i<nblocks; i++) @@ -1191,7 +1249,6 @@ STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const u { const uint8x16_t *k = (const uint8x16_t *)expandedKey; const uint8x16_t *x = (const uint8x16_t *)xor; - uint8x16_t tmp; int i; for (i=0; i<nblocks; i++) @@ -1244,16 +1301,17 @@ STATIC INLINE void aligned_free(void *ptr) } #endif /* FORCE_USE_HEAP */ +STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b) +{ + U64(a)[0] ^= U64(b)[0]; + U64(a)[1] ^= U64(b)[1]; +} + void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { RDATA_ALIGN16 uint8_t expandedKey[240]; -#ifndef FORCE_USE_HEAP - RDATA_ALIGN16 uint8_t local_hp_state[MEMORY]; -#else - uint8_t *local_hp_state = (uint8_t *)aligned_malloc(MEMORY,16); -#endif - + uint8_t *local_hp_state; uint8_t text[INIT_SIZE_BYTE]; RDATA_ALIGN16 uint64_t a[2]; RDATA_ALIGN16 uint64_t b[4]; @@ -1264,12 +1322,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int size_t i, j; uint64_t *p = NULL; + oaes_ctx *aes_ctx = NULL; + int useAes = !force_software_aes() && check_aes_hw(); static void (*const extra_hashes[4])(const void *, size_t, char *) = { hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein }; + // this isn't supposed to happen, but guard against it for now. + if(hp_state == NULL) + cn_slow_hash_allocate_state(); + + // locals to avoid constant TLS dereferencing + local_hp_state = hp_state; + + // locals to avoid constant TLS dereferencing /* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */ if (prehashed) { @@ -1287,11 +1355,26 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int * the 2MB large random access buffer. */ - aes_expand_key(state.hs.b, expandedKey); - for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + if(useAes) { - aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK); - memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + aes_expand_key(state.hs.b, expandedKey); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK); + memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + } + } + else + { + aes_ctx = (oaes_ctx *) oaes_alloc(); + oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + for(j = 0; j < INIT_SIZE_BLK; j++) + aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); + + memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + } } U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0]; @@ -1307,13 +1390,26 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int _b = vld1q_u8((const uint8_t *)b); _b1 = vld1q_u8(((const uint8_t *)b) + AES_BLOCK_SIZE); - for(i = 0; i < ITER / 2; i++) + if(useAes) { - pre_aes(); - _c = vaeseq_u8(_c, zero); - _c = vaesmcq_u8(_c); - _c = veorq_u8(_c, _a); - post_aes(); + for(i = 0; i < ITER / 2; i++) + { + pre_aes(); + _c = vaeseq_u8(_c, zero); + _c = vaesmcq_u8(_c); + _c = veorq_u8(_c, _a); + post_aes(); + } + } + else + { + for(i = 0; i < ITER / 2; i++) + { + pre_aes(); + aesb_single_round((uint8_t *) &_c, (uint8_t *) &_c, (uint8_t *) &_a); + post_aes(); + } + } /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds @@ -1322,11 +1418,27 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int memcpy(text, state.init, INIT_SIZE_BYTE); - aes_expand_key(&state.hs.b[32], expandedKey); - for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + if(useAes) + { + aes_expand_key(&state.hs.b[32], expandedKey); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + // add the xor to the pseudo round + aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + } + } + else { - // add the xor to the pseudo round - aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE); + for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) + { + for(j = 0; j < INIT_SIZE_BLK; j++) + { + xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); + aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); + } + } + oaes_free((OAES_CTX **) &aes_ctx); } /* CryptoNight Step 5: Apply Keccak to the state again, and then @@ -1339,10 +1451,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int memcpy(state.init, text, INIT_SIZE_BYTE); hash_permutation(&state.hs); extra_hashes[state.hs.b[0] & 3](&state, 200, hash); - -#ifdef FORCE_USE_HEAP - aligned_free(local_hp_state); -#endif } #else /* aarch64 && crypto */ |