author     Riccardo Spagni <ric@spagni.net>    2019-02-14 21:30:42 +0200
committer  Riccardo Spagni <ric@spagni.net>    2019-02-14 21:30:42 +0200
commit     b9b54dffbe328a05ea54b46fda9a8e1ec75511a9 (patch)
tree       65e1d7953c4cd82ae6bd1e0f92799d43ad539d23 /src/crypto
parent     Merge pull request #4716 (diff)
parent     Build fixes for some platforms (diff)
download   monero-b9b54dffbe328a05ea54b46fda9a8e1ec75511a9.tar.xz
Merge pull request #5144

0b0fb709 Build fixes for some platforms (moneromooo-monero)
3ac3366a blockchain: add v10 fork heights (moneromooo-monero)
c2b8037a Adding cnv4-2 tweaks (Lee Clagett)
1b77e80a Cryptonight variant 4 aka CryptonightR (SChernykh)
b980fec4 slow-hash: some more big endian fixes (xiphon)
cd22a02d slow-hash: fix for big endian (moneromooo-monero)
96f53815 Small function declaration cleanup in slow-hash.c (Pol Mauri)
612ecb13 Add support for V10 protocol with BulletProofV2 and short amount. (cslashm)
823b38bc Fix dummy decryption in debug mode (cslashm)
377de050 fix log namespace (cslashm)
ff9853b4 New scheme key destination contrfol (cslashm)
041af954 cryptonote: Fix enum check in expand_transaction_2 (Tom Smeding)
1767c817 simplewallet: tell the user to complain to the recipient (moneromooo-monero)
c2d0e9a1 ringct: fix v1 ecdhInfo serialization (moneromooo-monero)
0b18fa54 ringct: the commitment mask is now deterministic (moneromooo-monero)
6ba3a116 ringct: encode 8 byte amount, saving 24 bytes per output (moneromooo-monero)
77f8f454 ringct: save 3 bytes on bulletproof size (moneromooo-monero)
f7f67600 add a bulletproof version, new bulletproof type, and rct config (moneromooo-monero)
5b07e0c9 core: include a dummy encrypted payment id when no payment is used (moneromooo-monero)
2eff9b12 core, wallet: remember original text version of destination address (moneromooo-monero)
e10d12b6 simplewallet: disable long payment ids by default (moneromooo-monero)
b84b350f blockchain: fix wrong hf version when popping multiple blocks (moneromooo-monero)
1c7650f4 simplewallet: remove ability to transfer with detached short payment ids (moneromooo-monero)
4a7917ef blockchain: fix block rate check for empty blockchains (moneromooo-monero)
34d2850f ignore child process when exec (Jethro Grassie)
5f89caea wallet2: fix ring reuse breaking when using histogram (moneromooo-monero)
db1e0a53 core: fix unmixable special case allowing ring size below 11 (moneromooo-monero)
66772f12 blockchain: include number of discarded blocks in --reorg-notify (moneromooo-monero)
af1ade4e core: add a few more block rate window sizes (moneromooo-monero)
fb0dbab9 notify: fix tokenizing being too strict (moneromooo-monero)
0ac22d0e core: add --block-rate-notify (moneromooo-monero)
7d2f817f blockchain: add --reorg-notify (moneromooo-monero)
1168e8d5 cryptonote_core: warn when the block rate deviates from expectations (moneromooo-monero)
842a5d8b notify: handle arbitrary tags (moneromooo-monero)
ebc60a09 ArticMine's new block weight algorithm (moneromooo-monero)
Diffstat (limited to 'src/crypto')
-rw-r--r--  src/crypto/chacha.h                   8
-rw-r--r--  src/crypto/hash-ops.h                 2
-rw-r--r--  src/crypto/hash.h                     8
-rw-r--r--  src/crypto/slow-hash.c              196
-rw-r--r--  src/crypto/variant4_random_math.h   441
5 files changed, 592 insertions(+), 63 deletions(-)
diff --git a/src/crypto/chacha.h b/src/crypto/chacha.h
index 6e85ad0e9..0610f7051 100644
--- a/src/crypto/chacha.h
+++ b/src/crypto/chacha.h
@@ -73,18 +73,18 @@ namespace crypto {
inline void generate_chacha_key(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) {
static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key");
epee::mlocked<tools::scrubbed_arr<char, HASH_SIZE>> pwd_hash;
- crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+ crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
for (uint64_t n = 1; n < kdf_rounds; ++n)
- crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+ crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key));
}
inline void generate_chacha_key_prehashed(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) {
static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key");
epee::mlocked<tools::scrubbed_arr<char, HASH_SIZE>> pwd_hash;
- crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/);
+ crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/, 0/*height*/);
for (uint64_t n = 1; n < kdf_rounds; ++n)
- crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+ crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key));
}
diff --git a/src/crypto/hash-ops.h b/src/crypto/hash-ops.h
index d77d55cf3..891896120 100644
--- a/src/crypto/hash-ops.h
+++ b/src/crypto/hash-ops.h
@@ -79,7 +79,7 @@ enum {
};
void cn_fast_hash(const void *data, size_t length, char *hash);
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed);
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height);
void hash_extra_blake(const void *data, size_t length, char *hash);
void hash_extra_groestl(const void *data, size_t length, char *hash);
diff --git a/src/crypto/hash.h b/src/crypto/hash.h
index 995e2294e..165fe6bb0 100644
--- a/src/crypto/hash.h
+++ b/src/crypto/hash.h
@@ -71,12 +71,12 @@ namespace crypto {
return h;
}
- inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0) {
- cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 0/*prehashed*/);
+ inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) {
+ cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 0/*prehashed*/, height);
}
- inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0) {
- cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 1/*prehashed*/);
+ inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) {
+ cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 1/*prehashed*/, height);
}
inline void tree_hash(const hash *hashes, std::size_t count, hash &root_hash) {
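
Both C++ wrappers now take an optional block height (defaulting to 0) and pass it through to the C implementation. For illustration only, a hypothetical caller of the extended C entry point from hash-ops.h (everything except cn_slow_hash itself is an assumed name, not code from this change):

/* Illustrative caller sketch. Variants below 4 ignore the height argument;
 * variant 4 (CryptonightR) uses it to derive its per-height random program. */
#include <stdint.h>
#include <stddef.h>
#include "hash-ops.h"   /* cn_slow_hash(), HASH_SIZE */

static void pow_hash_sketch(const void *blob, size_t blob_len,
                            int variant, uint64_t block_height,
                            char hash_out[HASH_SIZE])
{
    /* prehashed = 0: let cn_slow_hash run the initial Keccak itself */
    cn_slow_hash(blob, blob_len, hash_out, variant, 0, block_height);
}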
diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c
index 40cfb0461..164cff3d4 100644
--- a/src/crypto/slow-hash.c
+++ b/src/crypto/slow-hash.c
@@ -39,6 +39,7 @@
#include "hash-ops.h"
#include "oaes_lib.h"
#include "variant2_int_sqrt.h"
+#include "variant4_random_math.h"
#define MEMORY (1 << 21) // 2MB scratchpad
#define ITER (1 << 20)
@@ -47,8 +48,8 @@
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
+extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
+extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
#define VARIANT1_1(p) \
do if (variant == 1) \
@@ -109,69 +110,96 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
memcpy(b + AES_BLOCK_SIZE, state.hs.b + 64, AES_BLOCK_SIZE); \
xor64(b + AES_BLOCK_SIZE, state.hs.b + 80); \
xor64(b + AES_BLOCK_SIZE + 8, state.hs.b + 88); \
- division_result = state.hs.w[12]; \
- sqrt_result = state.hs.w[13]; \
+ division_result = SWAP64LE(state.hs.w[12]); \
+ sqrt_result = SWAP64LE(state.hs.w[13]); \
} while (0)
#define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \
do if (variant >= 2) \
{ \
- const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+ __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+ if (variant >= 4) \
+ { \
+ chunk1 = _mm_xor_si128(chunk1, chunk2); \
+ _c = _mm_xor_si128(_c, chunk3); \
+ _c = _mm_xor_si128(_c, chunk1); \
+ } \
} while (0)
#define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \
do if (variant >= 2) \
{ \
- const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
+ uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \
const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \
vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
+ if (variant >= 4) \
+ { \
+ chunk1 = veorq_u64(chunk1, chunk2); \
+ _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \
+ _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \
+ } \
} while (0)
-#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \
+#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \
do if (variant >= 2) \
{ \
uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \
uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \
uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \
\
- const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \
+ uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \
+ const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \
+ const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \
\
uint64_t b1[2]; \
- memcpy(b1, b + 16, 16); \
- chunk1[0] = chunk3[0] + b1[0]; \
- chunk1[1] = chunk3[1] + b1[1]; \
+ memcpy_swap64le(b1, b + 16, 2); \
+ chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \
+ chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \
\
uint64_t a0[2]; \
- memcpy(a0, a, 16); \
- chunk3[0] = chunk2[0] + a0[0]; \
- chunk3[1] = chunk2[1] + a0[1]; \
+ memcpy_swap64le(a0, a_, 2); \
+ chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \
+ chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \
\
uint64_t b0[2]; \
- memcpy(b0, b, 16); \
- chunk2[0] = chunk1_old[0] + b0[0]; \
- chunk2[1] = chunk1_old[1] + b0[1]; \
+ memcpy_swap64le(b0, b, 2); \
+ chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \
+ chunk2[1] = SWAP64LE(SWAP64LE(chunk1_old[1]) + b0[1]); \
+ if (variant >= 4) \
+ { \
+ uint64_t out_copy[2]; \
+ memcpy_swap64le(out_copy, out, 2); \
+ chunk1_old[0] ^= chunk2_old[0]; \
+ chunk1_old[1] ^= chunk2_old[1]; \
+ out_copy[0] ^= chunk3_old[0]; \
+ out_copy[1] ^= chunk3_old[1]; \
+ out_copy[0] ^= chunk1_old[0]; \
+ out_copy[1] ^= chunk1_old[1]; \
+ memcpy_swap64le(out, out_copy, 2); \
+ } \
} while (0)
#define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \
- ((uint64_t*)(b))[0] ^= division_result ^ (sqrt_result << 32); \
+ uint64_t tmpx = division_result ^ (sqrt_result << 32); \
+ ((uint64_t*)(b))[0] ^= SWAP64LE(tmpx); \
{ \
- const uint64_t dividend = ((uint64_t*)(ptr))[1]; \
- const uint32_t divisor = (((uint64_t*)(ptr))[0] + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \
+ const uint64_t dividend = SWAP64LE(((uint64_t*)(ptr))[1]); \
+ const uint32_t divisor = (SWAP64LE(((uint64_t*)(ptr))[0]) + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \
division_result = ((uint32_t)(dividend / divisor)) + \
(((uint64_t)(dividend % divisor)) << 32); \
} \
- const uint64_t sqrt_input = ((uint64_t*)(ptr))[0] + division_result
+ const uint64_t sqrt_input = SWAP64LE(((uint64_t*)(ptr))[0]) + division_result
#define VARIANT2_INTEGER_MATH_SSE2(b, ptr) \
- do if (variant >= 2) \
+ do if ((variant == 2) || (variant == 3)) \
{ \
VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
VARIANT2_INTEGER_MATH_SQRT_STEP_SSE2(); \
@@ -181,7 +209,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
#if defined DBL_MANT_DIG && (DBL_MANT_DIG >= 50)
// double precision floating point type has enough bits of precision on current platform
#define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \
- do if (variant >= 2) \
+ do if ((variant == 2) || (variant == 3)) \
{ \
VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \
@@ -191,7 +219,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
// double precision floating point type is not good enough on current platform
// fall back to the reference code (integer only)
#define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \
- do if (variant >= 2) \
+ do if ((variant == 2) || (variant == 3)) \
{ \
VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
VARIANT2_INTEGER_MATH_SQRT_STEP_REF(); \
@@ -199,18 +227,70 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
#endif
#define VARIANT2_2_PORTABLE() \
- if (variant >= 2) { \
+ if (variant == 2 || variant == 3) { \
xor_blocks(long_state + (j ^ 0x10), d); \
xor_blocks(d, long_state + (j ^ 0x20)); \
}
#define VARIANT2_2() \
- do if (variant >= 2) \
+ do if (variant == 2 || variant == 3) \
{ \
- *U64(hp_state + (j ^ 0x10)) ^= hi; \
- *(U64(hp_state + (j ^ 0x10)) + 1) ^= lo; \
- hi ^= *U64(hp_state + (j ^ 0x20)); \
- lo ^= *(U64(hp_state + (j ^ 0x20)) + 1); \
+ *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
+ *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
+ hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \
+ lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \
+ } while (0)
+
+#define V4_REG_LOAD(dst, src) \
+ do { \
+ memcpy((dst), (src), sizeof(v4_reg)); \
+ if (sizeof(v4_reg) == sizeof(uint32_t)) \
+ *(dst) = SWAP32LE(*(dst)); \
+ else \
+ *(dst) = SWAP64LE(*(dst)); \
+ } while (0)
+
+#define VARIANT4_RANDOM_MATH_INIT() \
+ v4_reg r[9]; \
+ struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \
+ do if (variant >= 4) \
+ { \
+ for (int i = 0; i < 4; ++i) \
+ V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \
+ v4_random_math_init(code, height); \
+ } while (0)
+
+#define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \
+ do if (variant >= 4) \
+ { \
+ uint64_t t[2]; \
+ memcpy(t, b, sizeof(uint64_t)); \
+ \
+ if (sizeof(v4_reg) == sizeof(uint32_t)) \
+ t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
+ else \
+ t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
+ \
+ memcpy(b, t, sizeof(uint64_t)); \
+ \
+ V4_REG_LOAD(r + 4, a); \
+ V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \
+ V4_REG_LOAD(r + 6, _b); \
+ V4_REG_LOAD(r + 7, _b1); \
+ V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \
+ \
+ v4_random_math(code, r); \
+ \
+ memcpy(t, a, sizeof(uint64_t) * 2); \
+ \
+ if (sizeof(v4_reg) == sizeof(uint32_t)) { \
+ t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \
+ t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \
+ } else { \
+ t[0] ^= SWAP64LE(r[2] ^ r[3]); \
+ t[1] ^= SWAP64LE(r[0] ^ r[1]); \
+ } \
+ memcpy(a, t, sizeof(uint64_t) * 2); \
} while (0)
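
Several of the changes in this hunk only matter on big-endian hosts: SWAP32LE / SWAP64LE and memcpy_swap64le come from common/int-util.h and compile away on little-endian machines. A rough sketch of their assumed semantics follows (this is not the project's actual implementation, and it assumes a GCC/Clang-style __builtin_bswap64):

/* Sketch of the byte-order helpers assumed above: convert 64-bit words between
 * host order and little-endian, either one value at a time or while copying. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint64_t swap64le_sketch(uint64_t x)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    return __builtin_bswap64(x);   /* big-endian host: byte-swap to/from LE */
#else
    return x;                      /* little-endian host: no-op */
#endif
}

/* memcpy_swap64le(dst, src, n): copy n 64-bit words, converting each one */
static void memcpy_swap64le_sketch(void *dst, const void *src, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        uint64_t w;
        memcpy(&w, (const uint8_t *)src + 8 * i, 8);
        w = swap64le_sketch(w);
        memcpy((uint8_t *)dst + 8 * i, &w, 8);
    }
}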
@@ -297,6 +377,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
p = U64(&hp_state[j]); \
b[0] = p[0]; b[1] = p[1]; \
VARIANT2_INTEGER_MATH_SSE2(b, c); \
+ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
__mul(); \
VARIANT2_2(); \
VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
@@ -693,7 +774,7 @@ void slow_hash_free_state(void)
* @param length the length in bytes of the data
* @param hash a pointer to a buffer in which the final 256 bit hash will be stored
*/
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
{
RDATA_ALIGN16 uint8_t expandedKey[240]; /* These buffers are aligned to use later with SSE functions */
@@ -729,6 +810,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
VARIANT1_INIT64();
VARIANT2_INIT64();
+ VARIANT4_RANDOM_MATH_INIT();
/* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill
* the 2MB large random access buffer.
@@ -900,6 +982,7 @@ union cn_slow_hash_state
p = U64(&hp_state[j]); \
b[0] = p[0]; b[1] = p[1]; \
VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
+ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
__mul(); \
VARIANT2_2(); \
VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
@@ -1062,7 +1145,7 @@ STATIC INLINE void aligned_free(void *ptr)
}
#endif /* FORCE_USE_HEAP */
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
{
RDATA_ALIGN16 uint8_t expandedKey[240];
@@ -1099,6 +1182,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
VARIANT1_INIT64();
VARIANT2_INIT64();
+ VARIANT4_RANDOM_MATH_INIT();
/* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill
* the 2MB large random access buffer.
@@ -1277,10 +1361,11 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b)
U64(a)[1] ^= U64(b)[1];
}
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
{
uint8_t text[INIT_SIZE_BYTE];
uint8_t a[AES_BLOCK_SIZE];
+ uint8_t a1[AES_BLOCK_SIZE];
uint8_t b[AES_BLOCK_SIZE * 2];
uint8_t c[AES_BLOCK_SIZE];
uint8_t c1[AES_BLOCK_SIZE];
@@ -1316,6 +1401,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
VARIANT1_INIT64();
VARIANT2_INIT64();
+ VARIANT4_RANDOM_MATH_INIT();
// use aligned data
memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
@@ -1339,10 +1425,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
// Iteration 1
j = state_index(a);
p = &long_state[j];
- aesb_single_round(p, p, a);
- copy_block(c1, p);
+ aesb_single_round(p, c1, a);
- VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
+ VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+ copy_block(p, c1);
xor_blocks(p, b);
VARIANT1_1(p);
@@ -1351,13 +1437,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
p = &long_state[j];
copy_block(c, p);
+ copy_block(a1, a);
VARIANT2_PORTABLE_INTEGER_MATH(c, c1);
+ VARIANT4_RANDOM_MATH(a1, c, r, b, b + AES_BLOCK_SIZE);
mul(c1, c, d);
VARIANT2_2_PORTABLE();
- VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
- sum_half_blocks(a, d);
- swap_blocks(a, c);
- xor_blocks(a, c);
+ VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+ sum_half_blocks(a1, d);
+ swap_blocks(a1, c);
+ xor_blocks(a1, c);
VARIANT1_2(U64(c) + 1);
copy_block(p, c);
@@ -1365,6 +1453,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
copy_block(b + AES_BLOCK_SIZE, b);
}
copy_block(b, c1);
+ copy_block(a, a1);
}
memcpy(text, state.init, INIT_SIZE_BYTE);
@@ -1408,10 +1497,7 @@ static void (*const extra_hashes[4])(const void *, size_t, char *) = {
hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
};
-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
-
-static size_t e2i(const uint8_t* a, size_t count) { return (*((uint64_t*)a) / AES_BLOCK_SIZE) & (count - 1); }
+static size_t e2i(const uint8_t* a, size_t count) { return (SWAP64LE(*((uint64_t*)a)) / AES_BLOCK_SIZE) & (count - 1); }
static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
uint64_t a0, b0;
@@ -1478,7 +1564,7 @@ union cn_slow_hash_state {
};
#pragma pack(pop)
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) {
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) {
#ifndef FORCE_USE_HEAP
uint8_t long_state[MEMORY];
#else
@@ -1488,6 +1574,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
union cn_slow_hash_state state;
uint8_t text[INIT_SIZE_BYTE];
uint8_t a[AES_BLOCK_SIZE];
+ uint8_t a1[AES_BLOCK_SIZE];
uint8_t b[AES_BLOCK_SIZE * 2];
uint8_t c1[AES_BLOCK_SIZE];
uint8_t c2[AES_BLOCK_SIZE];
@@ -1507,6 +1594,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
VARIANT1_PORTABLE_INIT();
VARIANT2_PORTABLE_INIT();
+ VARIANT4_RANDOM_MATH_INIT();
oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE);
for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
@@ -1530,7 +1618,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
copy_block(c1, &long_state[j]);
aesb_single_round(c1, c1, a);
- VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
+ VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
copy_block(&long_state[j], c1);
xor_blocks(&long_state[j], b);
assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
@@ -1538,22 +1626,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
/* Iteration 2 */
j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
copy_block(c2, &long_state[j]);
+ copy_block(a1, a);
VARIANT2_PORTABLE_INTEGER_MATH(c2, c1);
+ VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE);
mul(c1, c2, d);
VARIANT2_2_PORTABLE();
- VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
- swap_blocks(a, c1);
- sum_half_blocks(c1, d);
- swap_blocks(c1, c2);
- xor_blocks(c1, c2);
+ VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+ sum_half_blocks(a1, d);
+ swap_blocks(a1, c2);
+ xor_blocks(a1, c2);
VARIANT1_2(c2 + 8);
copy_block(&long_state[j], c2);
- assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
if (variant >= 2) {
copy_block(b + AES_BLOCK_SIZE, b);
}
- copy_block(b, a);
- copy_block(a, c1);
+ copy_block(b, c1);
+ copy_block(a, a1);
}
memcpy(text, state.init, INIT_SIZE_BYTE);
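
Both portable paths above now follow the same reworked inner-loop shape. As a condensed reading aid (names as in the diff, variant-1 tweaks omitted; this is a restatement of the hunks above, not a drop-in replacement):

/* Condensed restatement of the reworked portable inner loop. The working copy
 * a1 lets VARIANT4_RANDOM_MATH update the accumulator while the shuffle-add
 * step still reads this round's original value of a. */
for (i = 0; i < ITER / 2; i++) {
    /* Iteration 1: one AES round into c1, shuffle-add, then store c1 ^ b */
    j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c1, &long_state[j]);
    aesb_single_round(c1, c1, a);
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    copy_block(&long_state[j], c1);
    xor_blocks(&long_state[j], b);

    /* Iteration 2: integer math (variants 2/3) or random math (variant 4),
     * 64x64 multiply, then mix the result back into the accumulator */
    j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c2, &long_state[j]);
    copy_block(a1, a);
    VARIANT2_PORTABLE_INTEGER_MATH(c2, c1);
    VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE);
    mul(c1, c2, d);
    VARIANT2_2_PORTABLE();
    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    sum_half_blocks(a1, d);
    swap_blocks(a1, c2);
    xor_blocks(a1, c2);
    copy_block(&long_state[j], c2);

    if (variant >= 2)
        copy_block(b + AES_BLOCK_SIZE, b);
    copy_block(b, c1);
    copy_block(a, a1);
}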
diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h
new file mode 100644
index 000000000..be5447395
--- /dev/null
+++ b/src/crypto/variant4_random_math.h
@@ -0,0 +1,441 @@
+#ifndef VARIANT4_RANDOM_MATH_H
+#define VARIANT4_RANDOM_MATH_H
+
+// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t)
+typedef uint32_t v4_reg;
+
+enum V4_Settings
+{
+ // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications
+ TOTAL_LATENCY = 15 * 3,
+
+ // Always generate at least 60 instructions
+ NUM_INSTRUCTIONS_MIN = 60,
+
+ // Never generate more than 70 instructions (final RET instruction doesn't count here)
+ NUM_INSTRUCTIONS_MAX = 70,
+
+ // Available ALUs for MUL
+ // Modern CPUs typically have only 1 ALU which can do multiplications
+ ALU_COUNT_MUL = 1,
+
+ // Total available ALUs
+ // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
+ ALU_COUNT = 3,
+};
+
+enum V4_InstructionList
+{
+ MUL, // a*b
+ ADD, // a+b + C, C is an unsigned 32-bit constant
+ SUB, // a-b
+ ROR, // rotate right "a" by "b & 31" bits
+ ROL, // rotate left "a" by "b & 31" bits
+ XOR, // a^b
+ RET, // finish execution
+ V4_INSTRUCTION_COUNT = RET,
+};
+
+// V4_InstructionDefinition is used to generate code from random data
+// Every random sequence of bytes is a valid code
+//
+// There are 9 registers in total:
+// - 4 variable registers
+// - 5 constant registers initialized from loop variables
+// This is why dst_index is 2 bits
+enum V4_InstructionDefinition
+{
+ V4_OPCODE_BITS = 3,
+ V4_DST_INDEX_BITS = 2,
+ V4_SRC_INDEX_BITS = 3,
+};
+
+struct V4_Instruction
+{
+ uint8_t opcode;
+ uint8_t dst_index;
+ uint8_t src_index;
+ uint32_t C;
+};
+
+#ifndef FORCEINLINE
+#if defined(__GNUC__)
+#define FORCEINLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define FORCEINLINE __forceinline
+#else
+#define FORCEINLINE inline
+#endif
+#endif
+
+#ifndef UNREACHABLE_CODE
+#if defined(__GNUC__)
+#define UNREACHABLE_CODE __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define UNREACHABLE_CODE __assume(false)
+#else
+#define UNREACHABLE_CODE
+#endif
+#endif
+
+// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
+// every switch-case will point to the same destination on every iteration of Cryptonight main loop
+//
+// This is about as fast as it can get without using low-level machine code generation
+static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
+{
+ enum
+ {
+ REG_BITS = sizeof(v4_reg) * 8,
+ };
+
+#define V4_EXEC(i) \
+ { \
+ const struct V4_Instruction* op = code + i; \
+ const v4_reg src = r[op->src_index]; \
+ v4_reg* dst = r + op->dst_index; \
+ switch (op->opcode) \
+ { \
+ case MUL: \
+ *dst *= src; \
+ break; \
+ case ADD: \
+ *dst += src + op->C; \
+ break; \
+ case SUB: \
+ *dst -= src; \
+ break; \
+ case ROR: \
+ { \
+ const uint32_t shift = src % REG_BITS; \
+ *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+ } \
+ break; \
+ case ROL: \
+ { \
+ const uint32_t shift = src % REG_BITS; \
+ *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+ } \
+ break; \
+ case XOR: \
+ *dst ^= src; \
+ break; \
+ case RET: \
+ return; \
+ default: \
+ UNREACHABLE_CODE; \
+ break; \
+ } \
+ }
+
+#define V4_EXEC_10(j) \
+ V4_EXEC(j + 0) \
+ V4_EXEC(j + 1) \
+ V4_EXEC(j + 2) \
+ V4_EXEC(j + 3) \
+ V4_EXEC(j + 4) \
+ V4_EXEC(j + 5) \
+ V4_EXEC(j + 6) \
+ V4_EXEC(j + 7) \
+ V4_EXEC(j + 8) \
+ V4_EXEC(j + 9)
+
+ // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
+ // I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
+ //
+ // 60 27960
+ // 61 105054
+ // 62 2452759
+ // 63 5115997
+ // 64 1022269
+ // 65 1109635
+ // 66 153145
+ // 67 8550
+ // 68 4529
+ // 69 102
+
+ // Unroll 70 instructions here
+ V4_EXEC_10(0); // instructions 0-9
+ V4_EXEC_10(10); // instructions 10-19
+ V4_EXEC_10(20); // instructions 20-29
+ V4_EXEC_10(30); // instructions 30-39
+ V4_EXEC_10(40); // instructions 40-49
+ V4_EXEC_10(50); // instructions 50-59
+ V4_EXEC_10(60); // instructions 60-69
+
+#undef V4_EXEC_10
+#undef V4_EXEC
+}
+
+// If we don't have enough data available, generate more
+static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
+{
+ if (*data_index + bytes_needed > data_size)
+ {
+ hash_extra_blake(data, data_size, (char*) data);
+ *data_index = 0;
+ }
+}
+
+// Generates as many random math operations as possible with given latency and ALU restrictions
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
+static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
+{
+ // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
+ // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
+ //
+ // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
+ // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
+ // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
+ // Source: https://www.agner.org/optimize/instruction_tables.pdf
+ const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+
+ // Instruction latencies for theoretical ASIC implementation
+ const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+
+ // Available ALUs for each instruction
+ const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+
+ int8_t data[32];
+ memset(data, 0, sizeof(data));
+ uint64_t tmp = SWAP64LE(height);
+ memcpy(data, &tmp, sizeof(uint64_t));
+ data[20] = 0xda; // change seed
+
+ // Set data_index past the last byte in data
+ // to trigger full data update with blake hash
+ // before we start using it
+ size_t data_index = sizeof(data);
+
+ int code_size;
+
+ // There is a small chance (1.8%) that register R8 won't be used in the generated program
+ // So we keep track of it and try again if it's not used
+ bool r8_used;
+ do {
+ int latency[9];
+ int asic_latency[9];
+
+ // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
+ // byte 0: current value of the destination register
+ // byte 1: instruction opcode
+ // byte 2: current value of the source register
+ //
+ // Registers R4-R8 are constant and are treated as having the same value because when we do
+ // the same operation twice with two constant source registers, it can be optimized into a single operation
+ uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+
+ bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
+ bool is_rotation[V4_INSTRUCTION_COUNT];
+ bool rotated[4];
+ int rotate_count = 0;
+
+ memset(latency, 0, sizeof(latency));
+ memset(asic_latency, 0, sizeof(asic_latency));
+ memset(alu_busy, 0, sizeof(alu_busy));
+ memset(is_rotation, 0, sizeof(is_rotation));
+ memset(rotated, 0, sizeof(rotated));
+ is_rotation[ROR] = true;
+ is_rotation[ROL] = true;
+
+ int num_retries = 0;
+ code_size = 0;
+
+ int total_iterations = 0;
+ r8_used = false;
+
+ // Generate random code to achieve minimal required latency for our abstract CPU
+ // Try to get this latency for all 4 registers
+ while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+ {
+ // Fail-safe to guarantee loop termination
+ ++total_iterations;
+ if (total_iterations > 256)
+ break;
+
+ check_data(&data_index, 1, data, sizeof(data));
+
+ const uint8_t c = ((uint8_t*)data)[data_index++];
+
+ // MUL = opcodes 0-2
+ // ADD = opcode 3
+ // SUB = opcode 4
+ // ROR/ROL = opcode 5, shift direction is selected randomly
+ // XOR = opcodes 6-7
+ uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
+ if (opcode == 5)
+ {
+ check_data(&data_index, 1, data, sizeof(data));
+ opcode = (data[data_index++] >= 0) ? ROR : ROL;
+ }
+ else if (opcode >= 6)
+ {
+ opcode = XOR;
+ }
+ else
+ {
+ opcode = (opcode <= 2) ? MUL : (opcode - 2);
+ }
+
+ uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
+ uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
+
+ const int a = dst_index;
+ int b = src_index;
+
+ // Don't do ADD/SUB/XOR with the same register
+ if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+ {
+ // Use register R8 as source instead
+ b = 8;
+ src_index = 8;
+ }
+
+ // Don't do rotation with the same destination twice because it's equal to a single rotation
+ if (is_rotation[opcode] && rotated[a])
+ {
+ continue;
+ }
+
+ // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
+ // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
+ // 2xXOR(a, b) = NOP
+ if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+ {
+ continue;
+ }
+
+ // Find which ALU is available (and when) for this instruction
+ int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
+ int alu_index = -1;
+ while (next_latency < TOTAL_LATENCY)
+ {
+ for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+ {
+ if (!alu_busy[next_latency][i])
+ {
+ // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
+ if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+ {
+ continue;
+ }
+
+ // Rotation can only start when previous rotation is finished, so do an additional availability check
+ if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+ {
+ continue;
+ }
+
+ alu_index = i;
+ break;
+ }
+ }
+ if (alu_index >= 0)
+ {
+ break;
+ }
+ ++next_latency;
+ }
+
+ // Don't generate instructions that leave some register unchanged for more than 7 cycles
+ if (next_latency > latency[a] + 7)
+ {
+ continue;
+ }
+
+ next_latency += op_latency[opcode];
+
+ if (next_latency <= TOTAL_LATENCY)
+ {
+ if (is_rotation[opcode])
+ {
+ ++rotate_count;
+ }
+
+ // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
+ alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
+ latency[a] = next_latency;
+
+ // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
+ asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
+
+ rotated[a] = is_rotation[opcode];
+
+ inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
+
+ code[code_size].opcode = opcode;
+ code[code_size].dst_index = dst_index;
+ code[code_size].src_index = src_index;
+ code[code_size].C = 0;
+
+ if (src_index == 8)
+ {
+ r8_used = true;
+ }
+
+ if (opcode == ADD)
+ {
+ // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
+ alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
+
+ // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
+ check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
+ uint32_t t;
+ memcpy(&t, data + data_index, sizeof(uint32_t));
+ code[code_size].C = SWAP32LE(t);
+ data_index += sizeof(uint32_t);
+ }
+
+ ++code_size;
+ if (code_size >= NUM_INSTRUCTIONS_MIN)
+ {
+ break;
+ }
+ }
+ else
+ {
+ ++num_retries;
+ }
+ }
+
+ // ASIC has more execution resources and can extract as much parallelism from the code as possible
+ // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
+ // Get this latency for at least 1 of the 4 registers
+ const int prev_code_size = code_size;
+ while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+ {
+ int min_idx = 0;
+ int max_idx = 0;
+ for (int i = 1; i < 4; ++i)
+ {
+ if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
+ if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+ }
+
+ const uint8_t pattern[3] = { ROR, MUL, MUL };
+ const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
+ latency[min_idx] = latency[max_idx] + op_latency[opcode];
+ asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
+
+ code[code_size].opcode = opcode;
+ code[code_size].dst_index = min_idx;
+ code[code_size].src_index = max_idx;
+ code[code_size].C = 0;
+ ++code_size;
+ }
+
+ // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+ // It never does more than 4 iterations for all block heights < 10,000,000
+ } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+
+ // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
+ // Add final instruction to stop the interpreter
+ code[code_size].opcode = RET;
+ code[code_size].dst_index = 0;
+ code[code_size].src_index = 0;
+ code[code_size].C = 0;
+
+ return code_size;
+}
+
+#endif
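
For readers who want to poke at the new header in isolation, here is a small hypothetical test harness (it is not part of this pull request). It assumes the file is built inside src/crypto so that hash-ops.h resolves and, via common/int-util.h, provides hash_extra_blake and the SWAP32LE/SWAP64LE macros the header relies on. It generates the per-height program, dumps the decoded instruction stream, and runs the interpreter once on arbitrary register values:

/* Hypothetical stand-alone harness for variant4_random_math.h (not in the tree). */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#include "hash-ops.h"               /* hash_extra_blake(), SWAP32LE/SWAP64LE */
#include "variant4_random_math.h"

int main(void)
{
    struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1];
    const uint64_t height = 1000000;          /* example height, not a fork height */
    const int code_size = v4_random_math_init(code, height);

    /* Dump the generated program: opcode, destination register, source register */
    for (int i = 0; i < code_size; ++i)
        printf("%3d: op=%u dst=r%u src=r%u C=0x%08x\n", i,
               (unsigned)code[i].opcode, (unsigned)code[i].dst_index,
               (unsigned)code[i].src_index, code[i].C);

    /* Run the fully unrolled interpreter once on arbitrary register values */
    v4_reg r[9] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    v4_random_math(code, r);
    printf("r0..r3 after one pass: %08x %08x %08x %08x\n", r[0], r[1], r[2], r[3]);

    return 0;
}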